In [24]:
import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
%matplotlib inline
from scipy.special import comb

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import log_loss, make_scorer
from sklearn.model_selection import cross_val_score, train_test_split, GridSearchCV
from sklearn.datasets import load_breast_cancer, load_iris

In [2]:
ls

Baseline_model.ipynb       get_urls.ipynb
DataPreparation.py         get_urls.py
EDA.ipynb                  model.ipynb
README.md                  new_feature.csv
[34m__pycache__[m[m/               scraping_users_info.ipynb
[34mdata[m[m/                      [34msrc[m[m/
data_cleaned.csv           users_data.csv
data_cleaning.ipynb        [34mwebpage[m[m/
data_no_nan.csv


In [3]:
data = pd.read_csv('data_cleaned.csv')

In [6]:
data.head()

Unnamed: 0.1,Unnamed: 0,user_name,full_name,num_posts,num_followers,num_followings,is_private,is_business,is_joined_recently,biography,num_followers_float,following_follower_ratio,engagement_rate,label
0,0,wqwqwq12345w,1998,0.0,1,12.0,0,0,1,åÁNi recentid@ ni dolid@ acÌÁ te la bancas_ÙÕ£...,1.0,6.0,0.0,0
1,1,soymuyfalsa,soy muy falsa,5.0,62,1.0,0,0,0,0,62.0,0.015873,0.0,0
2,2,andrea_bordon_,andrea bordon,1.0,132,78.0,0,1,0,25/08/1999Monfalcone-Udine Italy_Ùà¨_Ùà_,132.0,0.586466,0.0,0
3,3,@ndah.endah.3701,@ndah.endah.3701 _ã¢ Instagram photos and videos,3.0,798,138.0,0,1,0,Haters make me famous_Ù÷È,798.0,0.172716,0.0,0
4,4,jolsuperstar_tz,jolsuperstartz,66.0,494,195.0,0,0,0,_ÙÓ¡FANSHII NDIO SPECIAL ACCOUNT YANGU KARIBUN...,494.0,0.393939,0.0,0


In [42]:
X = data.iloc[:, [3,5,6,7,8,10,11]]

In [30]:
y = data['label']

In [44]:
X.shape

(198, 7)

In [32]:
y.shape

(198,)

### train-test split

In [69]:
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=4)

In [70]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((148, 7), (50, 7), (148,), (50,))

### logistic regression

In [71]:
model = LogisticRegression()

In [72]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 148 entries, 84 to 122
Data columns (total 7 columns):
num_posts                   148 non-null float64
num_followings              148 non-null float64
is_private                  148 non-null int64
 is_business                148 non-null int64
 is_joined_recently         148 non-null int64
num_followers_float         148 non-null float64
following_follower_ratio    148 non-null float64
dtypes: float64(4), int64(3)
memory usage: 9.2 KB


In [73]:
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [74]:
y_pred = model.predict_proba(X_test)

In [75]:
log_loss1 = log_loss(y_test, y_pred)

In [76]:
log_loss1

0.6981216448661424

In [78]:
cv_logloss = -cross_val_score(model, X, y, scoring = 'neg_log_loss', cv = 10).mean()
cv_logloss

0.32261185172152895

### random forest

In [93]:
rf = RandomForestClassifier(n_estimators=100,
                            n_jobs=-1,
                            random_state=1)

In [94]:
rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=1, verbose=0, warm_start=False)

In [95]:
y_pred = rf.predict_proba(X_test)[:, 1]
y_pred

array([0.22, 0.27, 0.  , 0.43, 0.26, 0.  , 0.  , 0.65, 0.  , 0.  , 0.33,
       0.62, 0.  , 0.53, 0.13, 0.55, 0.  , 0.1 , 0.  , 0.  , 0.  , 0.  ,
       0.3 , 0.  , 0.12, 0.  , 0.12, 0.62, 0.01, 0.  , 0.  , 0.12, 0.01,
       0.  , 0.  , 0.  , 0.01, 0.  , 0.  , 0.  , 0.02, 0.  , 0.  , 0.  ,
       0.01, 0.  , 0.02, 0.12, 0.27, 0.08])

In [96]:
cv_logloss_rf1 = -cross_val_score(rf, X_train, y_train, cv = 10, scoring = 'neg_log_loss').mean()

In [97]:
cv_logloss_rf1

0.1457642122468058

### Best random forest

In [86]:
random_forest_grid = {'max_depth': [3, 4, None],
                      'max_features': ['sqrt', 'log2', None],
                      'min_samples_split': [2, 4],
                      'min_samples_leaf': [1, 2, 4],
                      'bootstrap': [True, False],
                      'n_estimators': [20, 40, 80, 100],
                      'random_state': [359]}

rf_gridsearch = GridSearchCV(RandomForestClassifier(),
                             random_forest_grid,
                             n_jobs=-1,
                             verbose=True,
                             scoring='neg_log_loss')
rf_gridsearch.fit(X_train, y_train)

print("best parameters:", rf_gridsearch.best_params_)

best_rf_model = rf_gridsearch.best_estimator_

Fitting 3 folds for each of 432 candidates, totalling 1296 fits


[Parallel(n_jobs=-1)]: Done 116 tasks      | elapsed:    7.2s
[Parallel(n_jobs=-1)]: Done 416 tasks      | elapsed:   24.2s
[Parallel(n_jobs=-1)]: Done 916 tasks      | elapsed:   51.8s
[Parallel(n_jobs=-1)]: Done 1296 out of 1296 | elapsed:  1.3min finished


best parameters: {'bootstrap': True, 'max_depth': 3, 'max_features': 'sqrt', 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 100, 'random_state': 359}


In [87]:
best_rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=3, max_features='sqrt', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=359, verbose=0, warm_start=False)

In [88]:
y_pred_best = best_rf_model.predict_proba(X_test)[:, 1]


In [99]:
y_pred_best

array([2.29754658e-01, 1.93968278e-01, 8.38340291e-03, 3.88620649e-01,
       1.76440354e-01, 3.36408711e-04, 2.96942500e-03, 4.76212985e-01,
       1.14321685e-03, 3.93395463e-03, 2.19760040e-01, 5.84963903e-01,
       1.14321685e-03, 4.54465458e-01, 1.26176090e-01, 4.34407062e-01,
       3.36408711e-04, 1.05622985e-01, 2.96942500e-03, 7.53075378e-04,
       9.33457663e-03, 7.09219858e-05, 2.85011890e-01, 7.07238634e-03,
       2.73237032e-01, 2.96942500e-03, 1.26940478e-01, 4.76174573e-01,
       4.60663808e-02, 1.14321685e-03, 6.43593945e-03, 6.16790247e-02,
       1.51728138e-02, 6.53155106e-03, 2.96942500e-03, 2.80456585e-03,
       1.16720499e-02, 1.14321685e-03, 2.80456585e-03, 3.38148893e-03,
       8.38340291e-03, 2.38642313e-03, 1.14321685e-03, 2.73132976e-03,
       1.43339964e-02, 3.04266109e-03, 4.29834000e-02, 1.46706493e-01,
       1.65747722e-01, 1.32951143e-01])

In [91]:
cv_logloss_rf2 = -cross_val_score(best_rf_model, X_train, y_train, cv = 10, scoring = 'neg_log_loss').mean()

In [98]:
cv_logloss_rf2

0.14698773624506672