In [1]:
%pip install xgboost

Note: you may need to restart the kernel to use updated packages.


In [2]:
import sklearn.linear_model as lm, pandas as pd
from sklearn.model_selection import KFold, cross_val_score
from numpy import mean

In [5]:
drop_list = ["handle", "id", "retweet_count", "reply_count", "interaction_score", "quote_count", "followers", "text_first_hashtag", "url_image", "text"]
LABEL = "like_count"

In [6]:
def get_data(url, drop=[]):
    import pandas as pd
    df = pd.read_csv(url)
    if len(drop) > 0:
        for col in drop:
            df.drop(columns=[col], inplace=True)
    return df

def bin_groups(df, percent=.05):
    import pandas as pd
    for col in df:
        if not pd.api.types.is_numeric_dtype(df[col]):
            for group, count in df[col].value_counts().iteritems():
                if count / len(df) < percent:
                    df.loc[df[col] == group, col] = 'Other'
    return df

def drop_columns_missing_data(df, cutoff=.5):
    import pandas as pd
    for col in df:
        if df[col].isna().sum() / len(df) > cutoff:
            df.drop(columns=[col], inplace=True)
    return df

def impute_mean(df):
    from sklearn.impute import SimpleImputer
    import pandas as pd, numpy as np
    for col in df:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df = pd.get_dummies(df, columns=[col], drop_first=True)
    imp = SimpleImputer(missing_values=np.nan, strategy='mean')
    df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)
    return df

def impute_KNN(df):
    from sklearn.impute import KNNImputer
    from sklearn.preprocessing import MinMaxScaler
    import pandas as pd
    for col in df:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df = pd.get_dummies(df, columns=[col], drop_first=True)
    df = pd.DataFrame(MinMaxScaler().fit_transform(df), columns = df.columns)
    imp = KNNImputer(n_neighbors=5, weights="uniform")
    df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)
    return df
        
def impute_reg(df):
    from sklearn.experimental import enable_iterative_imputer
    from sklearn.impute import IterativeImputer
    import pandas as pd
    for col in df:
        if not pd.api.types.is_numeric_dtype(df[col]):
            df = pd.get_dummies(df, columns=[col], drop_first=True)
    imp = IterativeImputer(max_iter=10, random_state=12345)
    df = pd.DataFrame(imp.fit_transform(df), columns=df.columns)
    return df

def fs_variance(df, label="", p=0.8):
    from sklearn.feature_selection import VarianceThreshold
    import pandas as pd

    if label != "":
        X = df.drop(columns=[label])
    
    sel = VarianceThreshold(threshold=(p * (1 - p)))
    sel.fit(X)

    # Add the label back in after removing poor features
    return df[df.columns[sel.get_support(indices=True)]].join(df[label])

def fit_crossvalidate_mlr(df, k, label, repeat=True):
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
    import pandas as pd
    from numpy import mean, std
    X = df.drop(label,axis=1)
    y = df[label]
    if repeat:
        cv = RepeatedKFold(n_splits=k, n_repeats=5, random_state=12345)
    else:
        cv = KFold(n_splits=k, random_state=12345, shuffle=True)
    scores = cross_val_score(LinearRegression(), X, y, scoring='r2', cv=cv, n_jobs=-1)
    print(f'Average R-squared:\t{mean(scores)}')
    return LinearRegression().fit(X, y)

def dump_pickle(model, file_name):
    import pickle
    pickle.dump(model, open(file_name, "wb"))

def load_pickle(file_name):
    import pickle
    model = pickle.load(open(file_name, "rb"))
    return model


In [7]:
def fit_crossvalidate_clf(df, label, k=10, r=5, repeat=True):
    import sklearn.linear_model as lm, pandas as pd, sklearn.ensemble as se, numpy as np
    from sklearn.model_selection import KFold, RepeatedKFold, cross_val_score
    from numpy import mean, std
    from sklearn import svm
    from sklearn import gaussian_process
    from sklearn.gaussian_process.kernels import DotProduct, WhiteKernel
    from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
    from sklearn import svm
    from sklearn.naive_bayes import CategoricalNB
    from xgboost import XGBClassifier
    from sklearn import preprocessing
    from sklearn.neural_network import MLPClassifier
    
    X = df.drop(columns=[label])
    y = df[label]
    
    if repeat:
        cv = RepeatedKFold(n_splits=k, n_repeats=r, random_state=12345)
    else:
        cv = KFold(n_splits=k, random_state=12345, shuffle=True)
    
    fit = {}    # Use this to store each of the fit metrics
    models = {} # Use this to store each of the models
    
    # Create the model objects
    model_log = lm.LogisticRegression(max_iter=100)
    model_logcv = lm.RidgeClassifier()
    model_sgd = lm.SGDClassifier(max_iter=1000, tol=1e-3)
    model_pa = lm.PassiveAggressiveClassifier(max_iter=1000, random_state=12345, tol=1e-3)
    model_per = lm.Perceptron(fit_intercept=False, max_iter=10, tol=None, shuffle=False)
    model_knn = KNeighborsClassifier(n_neighbors=3)
    model_svm = svm.SVC(decision_function_shape='ovo') # Remove the parameter for two-class model
    model_nb = CategoricalNB()
    model_bag = se.BaggingClassifier(KNeighborsClassifier(), max_samples=0.5, max_features=0.5)
    model_ada = se.AdaBoostClassifier(n_estimators=100, random_state=12345)
    model_ext = se.ExtraTreesClassifier(n_estimators=100, random_state=12345)
    model_rf = se.RandomForestClassifier(n_estimators=10)
    model_hgb = se.HistGradientBoostingClassifier(max_iter=100)
    model_vot = se.VotingClassifier(estimators=[('lr', model_log), ('rf', model_ext), ('gnb', model_hgb)], voting='hard')
    model_gb = se.GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)
    estimators = [('ridge', lm.RidgeCV()), ('lasso', lm.LassoCV(random_state=12345)), ('knr', KNeighborsRegressor(n_neighbors=20, metric='euclidean'))]
    final_estimator = se.GradientBoostingRegressor(n_estimators=25, subsample=0.5, min_samples_leaf=25, max_features=1, random_state=12345)
    model_st = se.StackingRegressor(estimators=estimators, final_estimator=final_estimator)
    model_xgb = XGBClassifier()
    model_nn = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(5, 2), random_state=12345)
    
    # Fit a crss-validated R squared score and add it to the dict
    fit['Logistic'] = mean(cross_val_score(model_log, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Ridge'] = mean(cross_val_score(model_logcv, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['SGD'] = mean(cross_val_score(model_sgd, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['PassiveAggressive'] = mean(cross_val_score(model_pa, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Perceptron'] = mean(cross_val_score(model_per, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['KNN'] = mean(cross_val_score(model_knn, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['SVM'] = mean(cross_val_score(model_svm, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['NaiveBayes'] = mean(cross_val_score(model_nb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Bagging'] = mean(cross_val_score(model_bag, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['AdaBoost'] = mean(cross_val_score(model_ada, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['ExtraTrees'] = mean(cross_val_score(model_ext, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['RandomForest'] = mean(cross_val_score(model_rf, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['HistGradient'] = mean(cross_val_score(model_hgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['Voting'] = mean(cross_val_score(model_vot, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['GradBoost'] = mean(cross_val_score(model_gb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['XGBoost'] = mean(cross_val_score(model_xgb, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    fit['NeuralN'] = mean(cross_val_score(model_nn, X, y, scoring='accuracy', cv=cv, n_jobs=-1))
    
    # Add the model to another dict; make sure the keys have the same names as the list above
    models['Logistic'] = model_log
    models['Ridge'] = model_logcv
    models['SGD'] = model_sgd
    models['PassiveAggressive'] = model_pa
    models['Perceptron'] = model_per
    models['KNN'] = model_knn
    models['SVM'] = model_svm
    models['NaiveBayes'] = model_nb
    models['Bagging'] = model_bag
    models['AdaBoost'] = model_ada
    models['ExtraTrees'] = model_ext
    models['RandomForest'] = model_rf
    models['HistGradient'] = model_hgb
    models['Voting'] = model_vot
    models['GradBoost'] = model_gb
    models['XGBoost'] = model_xgb
    models['NeuralN'] = model_nn
    
        # Add the fit dictionary to a new DataFrame, sort, extract the top row, use it to retrieve the model object from the models dictionary
    df_fit = pd.DataFrame({'Accuracy':fit})
    df_fit.sort_values(by=['Accuracy'], ascending=False, inplace=True)
    best_model = df_fit.index[0]
    print(df_fit)
    
    return models[best_model].fit(X, y)

In [9]:
# Data cleaning and preparation pipeline
df = get_data('noRepies-topics.csv').drop(columns=drop_list)
df = bin_groups(df)
df = drop_columns_missing_data(df)

# Drop the label so it does not get dummy coded, then join it back in after
df = impute_mean(df.drop(columns=[LABEL])).join(df[LABEL])

# Feature selection and modeling pipeline
df = fs_variance(df, label=LABEL, p=.5)
model = fit_crossvalidate_clf(df, LABEL, 5, 2)

# Deployment pipeline
dump_pickle(model, 'best_clf_model.sav')

AttributeError: module 'sklearn.ensemble' has no attribute 'HistGradientBoostingClassifier'