In [306]:
import pandas as pd
import numpy as np

In [363]:
# Load the initial data and rename the id column to user_id
data = pd.read_csv('data/2000_followers_graines_version_2021_09_21 (1).csv', index_col=[0])
data = data.rename(columns={'id': 'user_id'})
list_user = list(data.user_id)

# Load every embedding block; all of them stay available in `emb`
emb = {}
emb['bert'] = np.load('embeddings/bert.npy')
emb['features'] = np.load('embeddings/features.npy')
emb['profile_pictures'] = np.load('embeddings/full_profile_pictures.npy')
emb['tfidf'] = np.load('embeddings/tfidf.npy')

full_list = (emb['tfidf'], emb['profile_pictures'], emb['features'], emb['bert'])

# Names of the embedding blocks that make up the feature matrix X, in column order.
# This cell used to build the concatenation of all four blocks and then
# immediately overwrite X with the tf-idf block alone; that effective behaviour
# (tf-idf only) is kept, but it is now stated explicitly instead of hidden in a
# second assignment. To train on everything, use
# ('tfidf', 'profile_pictures', 'features', 'bert').
selected_embeddings = ('tfidf',)

X = pd.DataFrame(np.concatenate([emb[name] for name in selected_embeddings], axis=1))

In [364]:
# Read the two annotation exports and stack them into a single frame
annotation_files = (
    'data/data annotated/project-116-at-2021-10-11-12-14-ffdbfe39.csv',
    'data/data annotated/project-118-at-2021-10-11-12-13-e2172a5e.csv',
)
data_1, data_2 = (pd.read_csv(csv_path) for csv_path in annotation_files)

df_ann = pd.concat((data_1, data_2))

In [365]:
# Merge the initial data with the annotation results on a set of shared profile fields
key = ['screen_name', 'name', 'description', 'protected', 'location']

# Tag every row of `data` with its position so that the matching row of X can
# still be found after the merge and the de-duplication below. Selecting X rows
# by the position inside `merged` is only right while no `data` row is dropped.
# (assumes row i of X describes row i of `data` -- TODO confirm)
left = data[key + ['user_id']].assign(__row_pos__=np.arange(len(data)))
merged = pd.merge(left, df_ann, on=key, how='left')

# A user matched by several annotation rows appears several times; keep the first match
merged = merged.drop_duplicates(['user_id'], keep='first')
row_pos = merged['__row_pos__'].to_numpy()
merged = merged[['user_id', 'sentiment']]
merged = merged.reset_index(drop=True)

# Keep only the users that have been annotated
is_annotated = merged['sentiment'].notna().to_numpy()
merged_fin = merged[is_annotated].reset_index(drop=True)

# Select the rows of X that belong to the annotated users
index_notnan = merged.index[is_annotated]
X_fin = np.array(X.iloc[row_pos[is_annotated]])

# Get the y labels; refuse labels outside the mapping instead of letting
# .map() turn them into NaN and saving a corrupted target vector
map_code = {'non-graine': 0, 'graine': 1}
unknown_labels = set(merged_fin['sentiment']) - set(map_code)
if unknown_labels:
    raise ValueError(
        "Unexpected annotation label(s): " + ", ".join(sorted(map(str, unknown_labels)))
    )
merged_fin['sentiment'] = merged_fin['sentiment'].map(map_code)
y_fin = np.array(merged_fin.sentiment)

# Save the data
np.save('data/final_X.npy', X_fin)
np.save('data/final_y.npy', y_fin)

In [380]:
import pandas as pd
import numpy as np
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_fscore_support, euclidean_distances
from sklearn.svm import SVC


def triangular_kernel(X, Y):
    """Return the kernel matrix ``1 - d(x, y)`` between the rows of X and Y.

    ``d`` is the pairwise Euclidean distance computed by
    ``euclidean_distances``; entry ``(i, j)`` of the result relates row ``i``
    of ``X`` to row ``j`` of ``Y``. Values are at most 1 and are not clipped
    at 0.
    """
    pairwise = euclidean_distances(X, Y)
    return 1 - np.abs(pairwise)


# Candidate models, looked up by name in the evaluation cell:
# an SVM using the custom triangular kernel (C=3) and an SVM with default settings.
classifiers = dict(
    SVM_triangular_kernel=SVC(C=3, kernel=triangular_kernel),
    SVM_RBF_kernel=SVC(),
)



In [390]:
# Split the annotated data with a fixed seed so the split is repeatable.
# NOTE(review): test_size=0.7 leaves only 30% of the rows for training and the
# split is not stratified -- confirm both are intended.
X_train, X_test, y_train, y_test = train_test_split(
    X_fin, y_fin, random_state=26, test_size=0.7
)

# Fit the triangular-kernel SVM and predict the held-out rows
clf = classifiers['SVM_triangular_kernel'].fit(X_train, y_train)
y_pred = clf.predict(X_test)

# Number of held-out rows predicted in each class
predicted_counts = pd.DataFrame(y_pred).value_counts()
print(predicted_counts)

# Precision / recall / F1 for the positive class (label 1)
scores = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
precision, recall, f1, _ = scores
print(precision, recall, f1)

0    1205
1       2
dtype: int64
0.5 0.011904761904761904 0.023255813953488372
