#### Import all the primary non-ML packages

In [1]:
import csv
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import time

%matplotlib inline

#### Set the local path

In [2]:
import subprocess
import os

# Absolute path of the directory the notebook is running in; the data paths
# in the following cells are built relative to it.
# os.getcwd() is used instead of shelling out to `pwd`, which does not exist
# on Windows and spawns an extra process for no benefit.
local_path = os.getcwd()

#### Load the data from the 'Data_Preprocessed' folder

In [3]:
# The preprocessed tweet table is stored as two parquet parts; read each one
# and stack them (in part order) into a single DataFrame.
part_files = [
    local_path+"/erdos_twitter_project/Data_Preprocessed/df_tweets_Shashank_features_added_part1.parquet",
    local_path+"/erdos_twitter_project/Data_Preprocessed/df_tweets_Shashank_features_added_part2.parquet",
]
df = pd.concat([pd.read_parquet(part_file) for part_file in part_files])

#### Drop textual, categorical data

In [4]:
# Columns removed before modelling (the textual / categorical fields).
dropthese = [
    'created_at',
    'created_at_user',
    'text',
    'Company_name',
    'media_type',
]
df = df.drop(columns=dropthese)

# Preview the first five rows of the remaining columns.
df.head()

Unnamed: 0,entities_cashtags,entities_hashtags,entities_urls,public_metrics_like_count,public_metrics_quote_count,public_metrics_reply_count,public_metrics_retweet_count,entities_mentions,public_metrics_followers_count,public_metrics_following_count,...,Word_count_LM11_pos,Word_count_LM11_neg,Word_count_Hagenau13_pos,Word_count_Hagenau13_neg,Tweet_Length_characters,Tweet_Length_words,Compound_vader,Positive_vader,Negative_vader,Neutral_vader
0,1,3,1,0,0,0,0,0,187956,75,...,0,0,0,0,119,21,0.0,0.0,0.0,1.0
1,1,3,1,0,0,0,0,0,187956,75,...,0,0,0,0,159,31,0.0,0.0,0.0,1.0
2,0,0,1,6,1,0,1,0,2276616,515,...,0,0,0,0,94,12,0.0,0.0,0.0,1.0
3,1,3,1,0,0,0,0,0,187956,75,...,0,0,0,0,156,31,0.0,0.0,0.0,1.0
4,1,3,1,0,0,0,0,0,187956,75,...,0,0,0,1,98,19,-0.2732,0.0,0.149,0.851


#### Label tweets with more than 20 likes as popular (class 1) and all other tweets as not popular (class 0)

In [5]:
# A tweet is labelled "popular" when its like count is strictly greater than
# this value.
LIKE_THRESHOLD = 20

# Binary target: 1 for popular tweets, 0 otherwise.
# The comparison is vectorised over the whole column rather than looping in
# Python; .tolist() keeps `y` a plain list of ints, as downstream cells expect.
y = (df['public_metrics_like_count'] > LIKE_THRESHOLD).astype(int).tolist()

# Feature matrix: every remaining column except the one the target is
# derived from.
X = df.drop(columns=['public_metrics_like_count'])

In [6]:
# Share of samples in the positive class (label 1); the complement is the
# share of the negative class (label 0).
popular_fraction = sum(y)/len(y)
print(f'fraction of tweets more than 20 likes and those less than 20 likes: {popular_fraction} , {1-popular_fraction}')

fraction of tweets more than 20 likes and those less than 20 likes: 0.5560581855240382 , 0.4439418144759618


#### Machine learning setup

In [7]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

# Pipeline stages: standardise the features, project them onto 20 principal
# components, then classify with k-nearest neighbours.
scaler = StandardScaler()
# random_state pins the decomposition whenever PCA's automatic solver choice
# falls on a randomised SVD, so repeated runs produce the same components.
# The value matches the seed used for the train/test split.
pca = PCA(n_components=20, random_state=123)
knn_classifier = KNeighborsClassifier()

pipe = Pipeline(steps=[
    ('scaler', scaler),
    ('dimensionality_reduction', pca),
    ('classifier', knn_classifier),
])

# The scorers can be either one of the predefined metric strings or a scorer
# callable, like the one returned by make_scorer
scoring = {
    "accuracy_score": make_scorer(accuracy_score),
    "f1_scorer": make_scorer(f1_score, average="weighted"),
}

# Grid search over the number of neighbours (5 through 10) with 10-fold
# cross-validation. Both metrics are recorded for every candidate (including
# train scores); the candidate with the best weighted F1 is refit at the end.
model = GridSearchCV(
    pipe,
    param_grid={'classifier__n_neighbors': range(5, 11)},
    scoring=scoring,
    refit="f1_scorer",
    return_train_score=True,
    cv=10,
    n_jobs=2,
)

#### Create train-test split and fit the model

In [None]:
from sklearn.model_selection import train_test_split
import sklearn.metrics


# Wall-clock timer; the measured span covers both the split and the fit.
start = time.perf_counter()

# Hold out 20% of the samples. The split is shuffled with a fixed seed and
# stratified on the labels.
split_options = dict(test_size=.2, random_state=123, shuffle=True, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

# Run the cross-validated grid search on the training partition only.
model.fit(X_train, y_train)

finish = time.perf_counter()
elapsed = finish - start
print(f'time taken : {elapsed}s')

In [None]:
# Mean cross-validated score of the best parameter setting, for the metric
# selected via `refit` ("f1_scorer").
model.best_score_

In [None]:
# Parameter setting that gave the best cross-validated score on the refit metric.
model.best_params_

In [None]:
# Keep the grid search's cross-validation results (a dict with one entry per
# candidate under each key) for later inspection.
results=model.cv_results_