The file comprises of data belonging to online bloggers in 'Twitch' regarding gaming and Twitch has been home to gaming streamers since its launch. The data contains top 1000 streamers from recent years. Data consists of number of columns like number of viewers, Number of active users, followers gained and other relevant coloumns 

In [1]:
# import packages
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from matplotlib import pyplot as plt
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier



np.random.seed(1)

In [2]:
df = pd.read_csv('./twitchdata.csv')
df.head(5)

Unnamed: 0,Channel,Watch time(Minutes),Stream time(minutes),Peak viewers,Average viewers,Followers,Follower Count,Followers gained,Views gained,Partnered,Mature,Language
0,Tfue,3671000070,123660,285644,29602,8938903,1,2068424,78998587,True,False,English
1,shroud,888505170,30240,471281,29612,7744066,1,833587,30621257,True,False,English
2,Myth,1479214575,134760,122552,9396,6726893,1,1421811,37384058,True,False,English
3,Rubius,2588632635,58275,240096,42948,5751354,1,3820532,58599449,True,False,Spanish
4,pokimane,964334055,56505,112160,16026,5367605,1,2085831,45579002,True,False,English


In [3]:
from sklearn import preprocessing

In [10]:
train_X = pd.read_csv("streamers_train_X.csv")
test_X = pd.read_csv("streamers_test_X.csv")
train_y = pd.read_csv("streamers_train_y.csv")
test_y = pd.read_csv("streamers_test_y.csv")

In [11]:
performance = pd.DataFrame({"model": [], "Accuracy": [], "Precision": [], "Recall": [], "F1": []})

In [12]:
log_reg_model = LogisticRegression(penalty='none', max_iter=900)
_ = log_reg_model.fit(X_train, np.ravel(y_train))

In [13]:
model_preds = log_reg_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"default logistic", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548


In [14]:
#random search in log
score_measure = "recall"
kfolds = 5

param_grid = {
    'C':[0.1,1,10,100],
    'penalty': ['l2'],
    'solver': ['liblinear']
    
}

logreg = LogisticRegression()

rand_search_log = RandomizedSearchCV(estimator = logreg, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search_log.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_log.best_score_}")
print(f"... with parameters: {rand_search_log.best_params_}")

bestRecallTree = rand_search_log.best_estimator_



Fitting 5 folds for each of 4 candidates, totalling 20 fits
The best recall score is 0.9710144927536231
... with parameters: {'solver': 'liblinear', 'penalty': 'l2', 'C': 100}


  y = column_or_1d(y, warn=True)


In [15]:
#svm with linear kernel
svm_lin_model = SVC(kernel="linear",probability=True)
_ = svm_lin_model.fit(X_train, np.ravel(y_train))

In [16]:
model_preds = svm_lin_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"lin svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703


In [17]:
#svm with rbf kernel
svm_rbf_model = SVC(kernel="rbf", C=10, gamma='scale',probability=True)
_ = svm_rbf_model.fit(X_train, np.ravel(y_train))

In [18]:
model_preds = svm_rbf_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rbf svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703
0,rbf svm,0.94,0.992806,0.890323,0.938776


In [19]:
svm_poly_model = SVC(kernel="poly",probability=True, degree=3, coef0=1, C=10)
_ = svm_poly_model.fit(X_train, np.ravel(y_train))

In [20]:
model_preds = svm_poly_model.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"poly svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance


Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703
0,rbf svm,0.94,0.992806,0.890323,0.938776
0,poly svm,0.943333,0.985915,0.903226,0.942761


In [21]:
#random search in SVM
score_measure = "recall"
kfolds = 5

param_grid = {
    'C':[0.1,1,10,100],
    'gamma':[1,0.1,0.01,0.001],
    'kernel':['poly']
    
}

SVM_R_out = SVC()
rand_search = RandomizedSearchCV(estimator = SVM_R_out, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search.fit(X_train,y_train)

print(f"The best {score_measure} score is {rand_search.best_score_}")
print(f"... with parameters: {rand_search.best_params_}")

bestRecallTree = rand_search.best_estimator_



Fitting 5 folds for each of 16 candidates, totalling 80 fits
The best recall score is 0.9130434782608695
... with parameters: {'kernel': 'poly', 'gamma': 1, 'C': 1}


  y = column_or_1d(y, warn=True)


In [23]:
model_preds = rand_search.predict(X_test)
c_matrix = confusion_matrix(y_test, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rand_poly_svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703
0,rbf svm,0.94,0.992806,0.890323,0.938776
0,poly svm,0.943333,0.985915,0.903226,0.942761
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517


In [24]:
#grid search in SVM
score_measure = "recall"
kfolds = 5

param_grid = {
    'C':[0.1,1,10,100],
    'gamma':[1,0.1,0.01,0.001],
    'kernel':['poly']
    
}

SVM_G_out = SVC()
grid_search = GridSearchCV(estimator = SVM_G_out, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search.best_score_}")
print(f"... with parameters: {grid_search.best_params_}")

bestRecallTree = grid_search.best_estimator_

Fitting 5 folds for each of 16 candidates, totalling 80 fits
The best recall score is 0.9130434782608695
... with parameters: {'C': 1, 'gamma': 1, 'kernel': 'poly'}


  y = column_or_1d(y, warn=True)


In [25]:
model_preds = grid_search.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"grid_poly_svm", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703
0,rbf svm,0.94,0.992806,0.890323,0.938776
0,poly svm,0.943333,0.985915,0.903226,0.942761
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,grid_poly_svm,0.926667,0.978417,0.877419,0.92517


In [26]:
#random with tree
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(1,42),  
    'min_samples_leaf': np.arange(1,42),
    'min_impurity_decrease': np.arange(0.0001, 0.01, 0.0005),
    'max_leaf_nodes': np.arange(5, 42), 
    'max_depth': np.arange(1,42), 
    'criterion': ['entropy', 'gini'],
}

dtree = DecisionTreeClassifier()
rand_search_tree = RandomizedSearchCV(estimator = dtree, param_distributions=param_grid, cv=kfolds, n_iter=500,
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = rand_search_tree.fit(train_X, train_y)

print(f"The best {score_measure} score is {rand_search_tree.best_score_}")
print(f"... with parameters: {rand_search_tree.best_params_}")

bestRecallTree = rand_search_tree.best_estimator_

Fitting 5 folds for each of 500 candidates, totalling 2500 fits
The best recall score is 0.9971014492753623
... with parameters: {'min_samples_split': 30, 'min_samples_leaf': 5, 'min_impurity_decrease': 0.0031, 'max_leaf_nodes': 34, 'max_depth': 32, 'criterion': 'gini'}


60 fits failed out of a total of 2500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
60 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\davul\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\davul\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 937, in fit
    super().fit(
  File "C:\Users\davul\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 250, in fit
    raise ValueError(
ValueError: min_samples_split must be an integer greater than 1 or a float in (0.0, 1.0]; got the integer 1

 0.99710145 0.99710145 0.99710145 0.99710145 0.99710145 0.99710145
 0.997101

In [27]:
model_preds = rand_search_tree.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"rand_poly_tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703
0,rbf svm,0.94,0.992806,0.890323,0.938776
0,poly svm,0.943333,0.985915,0.903226,0.942761
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,grid_poly_svm,0.926667,0.978417,0.877419,0.92517
0,rand_poly_tree,1.0,1.0,1.0,1.0


In [28]:
#grid with tree
score_measure = "recall"
kfolds = 5

param_grid = {
    'min_samples_split': np.arange(3,6),  
    'min_samples_leaf': np.arange(3,6),
    'min_impurity_decrease': np.arange(0.0009, 0.0012,0.0001),
    'max_leaf_nodes': np.arange(36,40), 
    'max_depth': np.arange(39,42), 
    'criterion': ['entropy'],
}

dtree = DecisionTreeClassifier()
grid_search_tree = GridSearchCV(estimator = dtree, param_grid=param_grid, cv=kfolds, 
                           scoring=score_measure, verbose=1, n_jobs=-1,  # n_jobs=-1 will utilize all available CPUs 
                           return_train_score=True)

_ = grid_search_tree.fit(train_X, train_y)

print(f"The best {score_measure} score is {grid_search_tree.best_score_}")
print(f"... with parameters: {grid_search_tree.best_params_}")

bestRecallTree = grid_search_tree.best_estimator_

Fitting 5 folds for each of 324 candidates, totalling 1620 fits
The best recall score is 0.9971014492753623
... with parameters: {'criterion': 'entropy', 'max_depth': 39, 'max_leaf_nodes': 36, 'min_impurity_decrease': 0.0009, 'min_samples_leaf': 3, 'min_samples_split': 3}


In [29]:
model_preds = grid_search_tree.predict(test_X)
c_matrix = confusion_matrix(test_y, model_preds)
TP = c_matrix[1][1]
TN = c_matrix[0][0]
FP = c_matrix[0][1]
FN = c_matrix[1][0]
performance = pd.concat([performance, pd.DataFrame({'model':"grid_poly_tree", 
                                                    'Accuracy': [(TP+TN)/(TP+TN+FP+FN)], 
                                                    'Precision': [TP/(TP+FP)], 
                                                    'Recall': [TP/(TP+FN)], 
                                                    'F1': [2*TP/(2*TP+FP+FN)]
                                                     }, index=[0])])
performance

Unnamed: 0,model,Accuracy,Precision,Recall,F1
0,default logistic,0.993333,0.993548,0.993548,0.993548
0,lin svm,0.953333,1.0,0.909677,0.952703
0,rbf svm,0.94,0.992806,0.890323,0.938776
0,poly svm,0.943333,0.985915,0.903226,0.942761
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,rand_poly_svm,0.926667,0.978417,0.877419,0.92517
0,grid_poly_svm,0.926667,0.978417,0.877419,0.92517
0,rand_poly_tree,1.0,1.0,1.0,1.0
0,grid_poly_tree,1.0,1.0,1.0,1.0


After executing the dataset with all the predictive models. The best predictive model with recall score is random search with poly tree and grid search with poly tree where performing well .