In [2]:
import pandas as pd
import numpy as np
import sklearn
from sklearn import svm
from sklearn.neighbors import KNeighborsRegressor
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import mean_squared_error
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler


In [4]:
# Read the dataset from CSV and preview its first five rows
file_path = 'final_data.csv'
df = pd.read_csv(filepath_or_buffer=file_path)
print(df.head(n=5))

     likes  comment_count  mediatype                        media_id  \
0   625977           5666          2  3229508169541433972_2278169415   
1  2699396          18964          8  3228722089224462980_2278169415   
2  1323071           4503          2  3228132418989080740_2278169415   
3  1277938           7324          2  3227301016087955284_2278169415   
4  1128553           7278          2  3226675054326949290_2278169415   

   sentiment_average  follower_count  
0             0.2002        46082571  
1             0.0513        46082571  
2             0.1687        46082571  
3             0.0672        46082571  
4             0.0968        46082571  


In [5]:
# Engagement ratios: likes per follower, and comments per like
# NOTE(review): a row with zero likes or zero followers would yield inf/NaN here - confirm the data excludes them
df['likes_norm'] = df['likes'].div(df['follower_count'])
df['comment_norm'] = df['comment_count'].div(df['likes'])
# Keep only the columns used for modelling, in a separate frame
model_columns = ['comment_norm', 'likes_norm', 'sentiment_average']
df2 = df.filter(items=model_columns, axis='columns')
print(df2)

     comment_norm  likes_norm  sentiment_average
0        0.009051    0.013584             0.2002
1        0.007025    0.058577             0.0513
2        0.003403    0.028711             0.1687
3        0.005731    0.027731             0.0672
4        0.006449    0.024490             0.0968
..            ...         ...                ...
195      0.010014    0.006156             0.0680
196      0.012294    0.001887             0.1507
197      0.028111    0.001601            -0.0667
198      0.016325    0.003005            -0.0747
199      0.016761    0.001970             0.0401

[200 rows x 3 columns]


In [6]:
# Define the threshold separating positive from non-positive sentiment
def threshold_function(value, threshold=0.001):
    """Convert a sentiment score into a binary class label.

    Parameters
    ----------
    value : float
        Sentiment score to classify (the notebook passes the
        ``sentiment_average`` column values).
    threshold : float, default 0.001
        Cut-off; only scores strictly greater than this count as positive.

    Returns
    -------
    int
        0 when ``value > threshold`` (positive sentiment), otherwise 1.
        A NaN score compares False against the threshold and therefore
        maps to 1.
    """
    return 0 if value > threshold else 1

In [7]:
# Label each row by passing its average sentiment through threshold_function
df2['sentiment_binary'] = df['sentiment_average'].map(threshold_function)
print(df2)

     comment_norm  likes_norm  sentiment_average  sentiment_binary
0        0.009051    0.013584             0.2002                 0
1        0.007025    0.058577             0.0513                 0
2        0.003403    0.028711             0.1687                 0
3        0.005731    0.027731             0.0672                 0
4        0.006449    0.024490             0.0968                 0
..            ...         ...                ...               ...
195      0.010014    0.006156             0.0680                 0
196      0.012294    0.001887             0.1507                 0
197      0.028111    0.001601            -0.0667                 1
198      0.016325    0.003005            -0.0747                 1
199      0.016761    0.001970             0.0401                 0

[200 rows x 4 columns]


In [8]:
# Split the modelling frame into the feature matrix and the target vector
feature_columns = ['likes_norm', 'comment_norm']
X = df2.loc[:, feature_columns]
y = df2.loc[:, 'sentiment_binary']


In [9]:
# Hold out 40% of the rows for testing; the fixed seed keeps the split repeatable
# NOTE(review): the split is not stratified and the recorded test set is 69 vs 11 - consider stratify=y
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.4,
    random_state=0,
)

In [10]:
# Fit a Gaussian naive Bayes classifier on the training split
clf = GaussianNB()
clf.fit(X=X_train, y=y_train)

In [11]:
# Score the naive Bayes model on the held-out rows and print per-class metrics
y_pred = clf.predict(X=X_test)
nb_report = classification_report(y_true=y_test, y_pred=y_pred)
print(nb_report)

              precision    recall  f1-score   support

           0       1.00      0.12      0.21        69
           1       0.15      1.00      0.27        11

    accuracy                           0.24        80
   macro avg       0.58      0.56      0.24        80
weighted avg       0.88      0.24      0.22        80



In [12]:
# Confusion matrix for the naive Bayes predictions (rows: true label, columns: predicted label)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[ 8, 61],
       [ 0, 11]], dtype=int64)

In [13]:
# Standardise the features: the scaler learns its statistics from the training
# split only and the same transform is then reused on the test split.
# NOTE(review): this rebinds X_train / X_test, so every cell run after this one
# (including any earlier cell re-run later) sees the scaled arrays.
sc = StandardScaler()
sc.fit(X_train)
X_train = sc.transform(X_train)
X_test = sc.transform(X_test)

In [14]:
# Train an RBF-kernel support vector classifier on the standardised features
from sklearn.svm import SVC
svc_settings = {'kernel': 'rbf', 'random_state': 0}
classifier = SVC(**svc_settings)
classifier.fit(X_train, y_train)

In [15]:
# Predict with the SVC; this overwrites the naive Bayes predictions held in y_pred
y_pred = classifier.predict(X=X_test)

In [16]:
# Evaluate the SVC: print the confusion matrix, then display overall accuracy
# NOTE(review): the recorded matrix shows every test row predicted as class 0,
# so the accuracy equals the majority-class share (69/80) - accuracy alone is misleading here.
from sklearn.metrics import accuracy_score, confusion_matrix
cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print(cm)
accuracy_score(y_true=y_test, y_pred=y_pred)

[[69  0]
 [11  0]]


0.8625

In [17]:
# Fit a logistic regression, created with no explicit arguments, on the standardised features
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [18]:
# Logistic regression settings: show the hyperparameters of the fitted `log_reg`.
# The previous version of this cell built a separate, throwaway
# LogisticRegression(...) with hand-written arguments, so what it displayed was
# not the configuration of the model that was actually trained, and it passed
# the `multi_class` argument that recent scikit-learn releases deprecate.
log_reg.get_params()

In [19]:
# Predict with the logistic regression; this overwrites the SVC predictions held in y_pred
y_pred = log_reg.predict(X=X_test)

In [20]:
# Confusion matrix for the logistic regression predictions (rows: true label, columns: predicted label)
confusion_matrix(y_true=y_test, y_pred=y_pred)

array([[69,  0],
       [11,  0]], dtype=int64)