<a href="https://colab.research.google.com/github/paolofiorio/Micro-Influencer-Classifier/blob/main/7_TwitterModelSelection.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# IMPORTS

In [None]:
import pandas as pd
import csv
import os
pd.set_option('display.max_rows', None)
import sklearn
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer,ENGLISH_STOP_WORDS,TfidfTransformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import GridSearchCV
import torch
!pip install xgboost==1.5.1
from xgboost import XGBClassifier
import pickle

In [None]:
# Clone the GitHub repository or upload the CSV files here so that 'Thesis/cleanTweetsSentiment.csv' (read in the next cell) is available

In [None]:
# Load the dataset used by every classification section below.
df = pd.read_csv('Thesis/cleanTweetsSentiment.csv')
# Last expression of the cell: shows the first rows as a quick sanity check.
df.head()

# MICRO CLASSIFICATION SECTION

In [None]:
# Remove the columns that the 'micro' classifiers below do not use
# (this includes the other target column, 'microTopic').
micro_unused_cols = ['id', 'screen_name', 'topic', 'language', 'scores', 'microTopic', 'tweets']
df1 = df.drop(columns=micro_unused_cols)

df1.head()

## RANDOM FOREST

In [None]:
# Grid search over Random Forest hyper-parameters for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'bootstrap': [True, False],
    'max_depth': [None, 10],
    # For classifiers 'auto' was only an alias of 'sqrt' (and is rejected by
    # newer scikit-learn releases), so listing both just fitted every
    # configuration twice.
    'max_features': ['sqrt'],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3],
    'n_estimators': [200, 300],
}

# Base model
rf = RandomForestClassifier()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate the Random Forest on the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# max_features='sqrt' is what 'auto' meant for classifiers; 'auto' itself is
# rejected by newer scikit-learn releases.
pipeline = Pipeline([
    ('clf', RandomForestClassifier(n_estimators=200, min_samples_split=2, min_samples_leaf=1,
                                   criterion='gini', max_features='sqrt', max_depth=None,
                                   bootstrap=True)),
])

# train classifier
pipeline.fit(X_train, y_train)

# evaluate on the held-out test set
predicted = pipeline.predict(X_test)

# accuracy_score takes (y_true, y_pred)
print("RFC Accuracy Score -> ", accuracy_score(y_test, predicted)*100)
print("Train accuracy ->", accuracy_score(y_train, pipeline.predict(X_train))*100)
tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## XGBOOST

In [None]:
# Grid search over XGBoost hyper-parameters for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

param_grid = dict(
    booster=['gbtree'],
    learning_rate=[0.01, 0.3],
    max_depth=[2, 4, 6],
    min_child_weight=[0.01, 0.5, 1],
    gamma=[0, 0.5],
    n_estimators=[100, 1000],
    alpha=[0, 0.5],
)

# Base model (the variable keeps the name used throughout the notebook)
rf = XGBClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train, evaluate and save the XGBoost model for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(alpha=0, booster='gbtree', gamma=0, learning_rate=0.01, max_depth=4,
                      min_child_weight=0.01, n_estimators=100)
# train classifier
model.fit(X_train, y_train)

# evaluate on the held-out test set
predicted = model.predict(X_test)

# The label previously read "RFC" (copied from the Random Forest cell).
print("XGB Accuracy Score -> ", accuracy_score(y_test, predicted)*100)
print("Train accuracy ->", accuracy_score(y_train, model.predict(X_train))*100)
tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

# Persist the trained booster in XGBoost's JSON format.
model.save_model("modelMicro.json")

In [None]:
# Serialise the trained `model` from the previous cell to a pickle file.
Pkl_Filename = "Pickle_XGB_Model.pkl"  

# 'wb': pickle writes bytes; the context manager closes the file afterwards.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

In [None]:
# Reload the pickled model written by the previous cell.
# NOTE(review): pickle.load executes arbitrary code from the file; only load
# pickles produced by this notebook or another trusted source.
with open(Pkl_Filename, 'rb') as file:  
    Pickled_XGB_Model = pickle.load(file)

# Last expression of the cell: displays the reloaded estimator.
Pickled_XGB_Model

In [None]:
# Score the reloaded model on X_test / y_test left in scope by the last
# train/test split that was executed.
score = Pickled_XGB_Model.score(X_test, y_test)  
# Print the score as a percentage
print("Test score: {0:.2f} %".format(100 * score))  

# Predictions of the reloaded model on the same test split
Ypredict = Pickled_XGB_Model.predict(X_test)  

# Last expression of the cell: displays the predictions.
Ypredict

In [None]:
# Quick check of the reloaded model on two rows of df1.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
# .loc slicing is label-based and inclusive, so this selects labels 1 and 2.
prova = df1.loc[1:2, feature_cols].to_numpy()
Ypredict = Pickled_XGB_Model.predict(prova)

Ypredict

## SVC

In [None]:
# Grid search over LinearSVC hyper-parameters for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'penalty': ['l1', 'l2'],
    # With dual=False, LinearSVC supports neither penalty together with
    # loss='hinge'; those candidates only produced failed fits, so only
    # 'squared_hinge' is searched.
    'loss': ['squared_hinge'],
    'C': [0.1, 0.5, 1, 10, 100, 1000, 2000],
    'dual': [False],
}

# Base model
rf = svm.LinearSVC()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate LinearSVC on the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
svc = svm.LinearSVC(C=0.1, loss='squared_hinge', penalty='l1', dual=False)
pipeline = Pipeline(steps=[('clf', svc)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("LINEAR SVC PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## MLP

In [None]:
# Grid search over MLPClassifier hyper-parameters for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

activations = ['identity', 'logistic', 'tanh', 'relu']
max_iters = [500, 600]
# MLPClassifier only uses `learning_rate` when solver='sgd'. Crossing it with
# 'lbfgs'/'adam' refitted the same configuration twice, so the schedule is
# searched for 'sgd' only.
param_grid = [
    {
        'solver': ['sgd'],
        'activation': activations,
        'learning_rate': ['constant', 'invscaling'],
        'max_iter': max_iters,
    },
    {
        'solver': ['lbfgs', 'adam'],
        'activation': activations,
        'max_iter': max_iters,
    },
]

# Base model
rf = MLPClassifier()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate the MLP on the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
mlp = MLPClassifier(activation='logistic', learning_rate='invscaling', max_iter=500, solver='adam')
pipeline = Pipeline(steps=[('clf', mlp)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("MLP PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## LOGISTIC REGRESSION

In [None]:
# Grid search over LogisticRegression hyper-parameters for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

c_values = [1, 10, 100, 1000]
# Not every solver supports every penalty, so a plain cartesian product spends
# most of the search on fits that raise. Only valid pairs are listed here.
param_grid = [
    {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['saga'], 'penalty': ['l1'], 'C': c_values},
    # elasticnet is saga-only and needs an explicit l1_ratio
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': c_values},
    # C is ignored without a penalty, so it is not searched here.
    # NOTE(review): newer scikit-learn releases spell this `None` instead of
    # 'none' -- adjust to the installed version.
    {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'penalty': ['none']},
]

# Base model
rf = LogisticRegression()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate logistic regression on the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
log_reg = LogisticRegression(C=10, penalty='l2', solver='liblinear')
pipeline = Pipeline(steps=[('clf', log_reg)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("LOG REG PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## SGD

In [None]:
# Grid search over SGDClassifier hyper-parameters for the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

penalties = ['l1', 'l2', 'elasticnet']
# The 'constant', 'invscaling' and 'adaptive' schedules need an initial
# learning rate eta0 > 0. The grid never set eta0, so those candidates could
# not be fitted with a zero default. 'optimal' does not use eta0.
param_grid = [
    {'penalty': penalties, 'learning_rate': ['optimal']},
    {
        'penalty': penalties,
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'eta0': [0.01, 0.1],
    },
]

# Base model
rf = SGDClassifier()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate the SGD classifier on the 'micro' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df1['micro']
X = df1[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
sgd = SGDClassifier(learning_rate='optimal', penalty='elasticnet')
pipeline = Pipeline(steps=[('clf', sgd)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("SGD PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

# MICRO TOPIC CLASSIFICATION SECTION

In [None]:
# Remove the columns that the 'microTopic' classifiers below do not use
# (this includes the other target column, 'micro').
topic_unused_cols = ['id', 'screen_name', 'topic', 'language', 'scores', 'micro', 'tweets']
df2 = df.drop(columns=topic_unused_cols)

df2.head()

## RANDOM FOREST

In [None]:
# Grid search over Random Forest hyper-parameters for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'bootstrap': [True, False],
    'max_depth': [None, 10],
    # For classifiers 'auto' was only an alias of 'sqrt' (and is rejected by
    # newer scikit-learn releases), so listing both just fitted every
    # configuration twice.
    'max_features': ['sqrt'],
    'criterion': ['gini', 'entropy'],
    'min_samples_leaf': [1, 2],
    'min_samples_split': [2, 3],
    'n_estimators': [200, 300],
}

# Base model
rf = RandomForestClassifier()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate the Random Forest on the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
forest = RandomForestClassifier(
    n_estimators=300,
    min_samples_split=2,
    min_samples_leaf=1,
    criterion='gini',
    max_features='sqrt',
    max_depth=10,
    bootstrap=False,
)
pipeline = Pipeline(steps=[('clf', forest)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("RFC Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## XGBOOST

In [None]:
# Grid search over XGBoost hyper-parameters for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

param_grid = dict(
    booster=['gbtree'],
    learning_rate=[0.01, 0.3],
    max_depth=[3, 4, 5],
    min_child_weight=[0.01, 0.5, 1],
    gamma=[0, 0.5, 1],
    n_estimators=[500, 1000],
    alpha=[0, 0.05],
)

# Base model (the variable keeps the name used throughout the notebook)
rf = XGBClassifier()
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid, n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train, evaluate and save the XGBoost model for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model = XGBClassifier(alpha=0, booster='gbtree', gamma=0.5, learning_rate=0.01, max_depth=4,
                      min_child_weight=0.5, n_estimators=500)
# train classifier
model.fit(X_train, y_train)

# evaluate on the held-out test set
predicted = model.predict(X_test)

# The label previously read "RFC" (copied from the Random Forest cell).
print("XGB Accuracy Score -> ", accuracy_score(y_test, predicted)*100)
print("Train accuracy ->", accuracy_score(y_train, model.predict(X_train))*100)
tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))
# Persist the trained booster in XGBoost's JSON format.
model.save_model("modelMicroTopic.json")

In [None]:
# Serialise the trained `model` from the previous cell to a pickle file.
Pkl_Filename = "Pickle_XGB_ModelMicroTopic.pkl"  

# 'wb': pickle writes bytes; the context manager closes the file afterwards.
with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(model, file)

## SVC

In [None]:
# Grid search over LinearSVC hyper-parameters for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

param_grid = {
    'penalty': ['l1', 'l2'],
    # With dual=False, LinearSVC supports neither penalty together with
    # loss='hinge'; those candidates only produced failed fits, so only
    # 'squared_hinge' is searched.
    'loss': ['squared_hinge'],
    'C': [0.1, 0.5, 1, 10, 100, 1000, 2000],
    'dual': [False],
}

# Base model
rf = svm.LinearSVC()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate LinearSVC on the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
svc = svm.LinearSVC(C=10, loss='squared_hinge', penalty='l1', dual=False)
pipeline = Pipeline(steps=[('clf', svc)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("LINEAR SVC PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## MLP

In [None]:
# Grid search over MLPClassifier hyper-parameters for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

activations = ['identity', 'logistic', 'tanh', 'relu']
max_iters = [500, 600]
# MLPClassifier only uses `learning_rate` when solver='sgd'. Crossing it with
# 'lbfgs'/'adam' refitted the same configuration twice, so the schedule is
# searched for 'sgd' only.
param_grid = [
    {
        'solver': ['sgd'],
        'activation': activations,
        'learning_rate': ['constant', 'invscaling'],
        'max_iter': max_iters,
    },
    {
        'solver': ['lbfgs', 'adam'],
        'activation': activations,
        'max_iter': max_iters,
    },
]

# Base model
rf = MLPClassifier()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate the MLP on the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
mlp = MLPClassifier(activation='logistic', learning_rate='constant', max_iter=500, solver='adam')
pipeline = Pipeline(steps=[('clf', mlp)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("MLP PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## LOGISTIC REGRESSION

In [None]:
# Grid search over LogisticRegression hyper-parameters for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

c_values = [1, 10, 100, 1000]
# Not every solver supports every penalty, so a plain cartesian product spends
# most of the search on fits that raise. Only valid pairs are listed here.
param_grid = [
    {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    {'solver': ['saga'], 'penalty': ['l1'], 'C': c_values},
    # elasticnet is saga-only and needs an explicit l1_ratio
    {'solver': ['saga'], 'penalty': ['elasticnet'], 'l1_ratio': [0.5], 'C': c_values},
    # C is ignored without a penalty, so it is not searched here.
    # NOTE(review): newer scikit-learn releases spell this `None` instead of
    # 'none' -- adjust to the installed version.
    {'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'], 'penalty': ['none']},
]

# Base model
rf = LogisticRegression()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate logistic regression on the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
log_reg = LogisticRegression(C=100, penalty='l1', solver='liblinear')
pipeline = Pipeline(steps=[('clf', log_reg)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("LOG REG PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))

## SGD

In [None]:
# Grid search over SGDClassifier hyper-parameters for the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2.loc[:, feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

penalties = ['l1', 'l2', 'elasticnet']
# The 'constant', 'invscaling' and 'adaptive' schedules need an initial
# learning rate eta0 > 0. The grid never set eta0, so those candidates could
# not be fitted with a zero default. 'optimal' does not use eta0.
param_grid = [
    {'penalty': penalties, 'learning_rate': ['optimal']},
    {
        'penalty': penalties,
        'learning_rate': ['constant', 'invscaling', 'adaptive'],
        'eta0': [0.01, 0.1],
    },
]

# Base model
rf = SGDClassifier()
# Grid search (default cross-validation), run on all available cores
grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           n_jobs=-1, verbose=2)

grid_search.fit(X_train, y_train)
grid_search.best_params_

In [None]:
# Train and evaluate the SGD classifier on the 'microTopic' target.
feature_cols = [
    "followers", "age", "followers_growth_rate", "followers_following_ratio",
    "tweet_freq", "interactions_no_retweets", "topicInTweetsPercentage",
    "topicInWordsPercentage", "positiveSentiment", "neutralSentiment",
    "negativeSentiment",
]
y = df2['microTopic']
X = df2[feature_cols].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Single-step pipeline around the classifier
sgd = SGDClassifier(learning_rate='optimal', penalty='l1')
pipeline = Pipeline(steps=[('clf', sgd)])
pipeline.fit(X_train, y_train)

# Predictions on the held-out and on the training split
predicted = pipeline.predict(X_test)
train_predicted = pipeline.predict(X_train)

test_accuracy = accuracy_score(y_test, predicted) * 100
train_accuracy = accuracy_score(y_train, train_predicted) * 100
print("SGD PIPELINE Accuracy Score -> ", test_accuracy)
print("Train accuracy ->", train_accuracy)

tn, fp, fn, tp = confusion_matrix(y_test, predicted).ravel()
print("tn, fp, fn, tp", (tn, fp, fn, tp))

target_names = ['not_micro_influencer', 'micro_influencer']
print(classification_report(y_test, predicted, target_names=target_names))