# Linear Models
This notebook will be mainly about the different linear models we tried to fit during the first steps into the project

---
## Logistic Regression:

In [3]:
import pandas as pd
import numpy as np
import pickle

#### Load the training tweets and the built GloVe word embeddings

In [5]:
sample_train_data = pd.read_csv('../cleaned_data/train_sample_data.csv')
sample_train_data.head()

Unnamed: 0,tweets,sign
0,ayee tell em ' bitches to come through,0.0
1,<user> <user> congrats on making it dude laug...,1.0
2,<user> yes ! i'd love to show you guys around ...,1.0
3,<user> we'll maybe meet there - bianca,0.0
4,<user> hello snooki follow me back please ? i ...,0.0


The sample_cleaned_emedding were created following the readme provided describing the Project 2.
Using `pos_train.txt`, `neg_train.txt` , `cooc.py` ...

In [7]:
embeddings = np.load("../glove_embeddings/cleaned_embeddings/sample/sample_cleaned_embeddings.npy")

In [8]:
embeddings

array([[-0.03639066, -0.09216319,  0.25672143, ...,  0.00438598,
         0.27709553, -0.28430814],
       [-0.11966957, -0.11714977,  0.31616412, ...,  0.03272727,
         0.33447519, -0.35110098],
       [-0.03528425, -0.17829468,  0.47392969, ..., -0.03130248,
         0.47083008, -0.49161173],
       ...,
       [ 1.90027776,  0.14359226,  1.00816857, ...,  0.78021657,
         1.24922489, -0.67362005],
       [-1.26125   ,  0.84964021, -0.49005907, ...,  0.30284872,
        -0.33666684,  1.03548732],
       [ 0.90150669, -0.12418292, -0.3043387 , ...,  0.00195154,
        -0.66822134,  0.13490772]])

In [None]:
vocab = pickle.load(open('../glove_embeddings/cleaned_embeddings/sample/cleaned_vocab.pkl', "rb")) #this file was generated by executing vocab.sh
word_embedding = {}
for key in vocab.keys():
    word_embedding[key] = embeddings[vocab.get(key)] # keys in this dict are not encoded 
word_embedding

In [10]:
word_embedding_df = pd.DataFrame(word_embedding).T

In [11]:
word_embedding_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
<user>,-0.036391,-0.092163,0.256721,0.338181,-0.121974,-0.139507,0.184167,-0.251719,-0.576921,-0.121730,0.302756,-0.304551,-0.325759,0.023690,-0.068419,-0.138550,0.070042,0.004386,0.277096,-0.284308
!,-0.119670,-0.117150,0.316164,0.530550,-0.216047,-0.211853,0.235751,-0.324863,-0.762572,-0.155339,0.429212,-0.426843,-0.436972,0.040640,-0.101456,-0.207414,0.105051,0.032727,0.334475,-0.351101
i,-0.035284,-0.178295,0.473930,0.673728,-0.274608,-0.290031,0.329904,-0.479382,-0.982926,-0.234918,0.569792,-0.617194,-0.510677,0.015466,-0.134536,-0.312960,0.116450,-0.031302,0.470830,-0.491612
you,-0.114303,-0.198763,0.535223,0.777906,-0.313000,-0.328890,0.368641,-0.518103,-1.090565,-0.243941,0.629524,-0.637401,-0.568887,0.027036,-0.187725,-0.365074,0.102648,-0.000199,0.510087,-0.503674
.,-0.063466,-0.205308,0.459918,0.609123,-0.259797,-0.293759,0.313210,-0.486077,-0.981628,-0.196206,0.508799,-0.520103,-0.495331,0.044647,-0.146040,-0.302999,0.117209,-0.036188,0.404599,-0.458328
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
#13,0.578525,0.092677,0.907422,-0.691363,0.490038,0.308752,-0.236962,1.382476,-1.155187,-0.330439,-1.882477,0.570032,-0.948262,0.039559,0.739215,-0.962181,-1.530525,0.159301,-1.720396,2.020875
#12,0.385156,-1.454628,1.161650,0.566216,2.103580,-0.386169,-0.027361,0.281791,-0.034381,-0.670752,0.183772,-1.114815,0.598311,-0.444628,-1.830605,0.519197,-0.322483,1.008614,1.952389,-0.772046
#100daystogo,-0.284537,0.066596,-0.502016,0.788958,0.115769,0.754378,0.279218,-0.179337,1.600295,0.917597,-1.725892,-0.180869,0.278357,-0.596489,0.018683,1.149776,-0.771335,0.552752,0.024422,0.559456
#100days,-0.408373,0.219682,0.812769,1.585777,1.134555,-1.355136,0.017074,0.000695,0.607982,0.539324,-1.133705,-0.923764,0.991551,0.108570,1.192866,-0.172446,0.771834,1.796897,-0.545241,-0.575005


In [12]:
sample_test_data = pd.read_csv("../cleaned_data/test_sample_data.csv")
sample_test_data

Unnamed: 0,tweets
0,sea doo pro sea scooter ( sports with the port...
1,<user> shucks well i work all week so now i ca...
2,i cant stay away from bug thats my baby
3,<user> no ma'am ! ! ! lol im perfectly fine an...
4,"whenever i fall asleep watching the tv , i alw..."
...,...
9995,had a nice time w / my friend lastnite
9996,<user> no it's not ! please stop !
9997,not without my daughter ( dvd two-time oscar (...
9998,<user> have fun in class sweetcheeks


- construct a feature representation of each training tweet (by averaging the word vectors over all words of the tweet)

In [13]:
def average_word_vectors(tweets ,word_embedding):
    
    error = 0
    avg_word_vectors = np.zeros((len(tweets), word_embedding.shape[1] ))
    for i, tweet in enumerate(tweets):
        
        split_tweet = tweet.split()
        nb_words = 0
        
        for word in split_tweet:
            try:
                avg_word_vectors[i] += word_embedding_df.loc[word].to_numpy()
                nb_words += 1

            except KeyError: 
                continue
        if (nb_words != 0):
            avg_word_vectors[i] /= nb_words
        
    return avg_word_vectors

In [14]:
avg_word_vectors = average_word_vectors(sample_train_data.tweets ,word_embedding_df)


In [15]:
avg_word_vectors_df = pd.DataFrame(avg_word_vectors)
avg_word_vectors_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.103042,-0.476258,0.253031,0.271344,-0.234250,-0.416527,0.350542,-0.263698,-0.677710,-0.348175,0.754546,-0.523776,-0.105592,0.051636,-0.148467,-0.423588,0.114603,0.056566,0.458621,-0.648659
1,-0.035343,-0.202305,0.569880,0.655770,-0.230325,-0.277664,0.348995,-0.435550,-0.946713,-0.233682,0.562428,-0.594078,-0.487470,0.059900,-0.177855,-0.337911,0.073457,-0.051864,0.510632,-0.386786
2,-0.063345,-0.098303,0.395130,0.665450,-0.270500,-0.271853,0.267416,-0.422232,-0.849598,-0.197942,0.555567,-0.544237,-0.445162,0.075814,-0.143498,-0.335021,0.127981,-0.023418,0.470948,-0.404352
3,0.084570,-0.304171,0.542734,0.674538,-0.562034,0.114750,0.536907,-0.515769,-0.733989,-0.135776,0.503222,-0.618604,-0.227896,0.100328,-0.111213,-0.220813,-0.043338,-0.014036,0.076931,-0.208996
4,-0.133802,-0.171259,0.459780,0.583461,-0.156678,-0.284474,0.412175,-0.353415,-0.850640,-0.278544,0.375492,-0.668245,-0.434267,-0.042149,-0.051464,-0.364874,-0.020598,-0.126889,0.458288,-0.335189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,-0.153317,-0.185661,0.495620,0.516081,-0.258990,-0.348437,0.509613,-0.321200,-0.795003,-0.095673,0.428474,-0.540596,-0.459039,-0.074225,-0.094385,-0.147408,0.035848,-0.124784,0.258645,-0.516928
199996,-0.072823,-0.241911,0.320355,0.616920,-0.104754,-0.029998,0.192886,-0.748162,-0.665640,-0.080984,0.162043,-0.466616,-0.884462,0.012675,-0.140857,-0.187924,-0.098489,0.249598,0.202039,-0.454237
199997,-0.019281,-0.097291,0.470629,0.571047,-0.230550,-0.210472,0.365973,-0.324163,-0.774482,-0.112376,0.514962,-0.677924,-0.422658,0.014096,-0.116945,-0.269073,0.021149,-0.033446,0.600041,-0.371906
199998,-0.126683,-0.128776,0.439913,0.545141,-0.262770,-0.251818,0.208687,-0.350087,-0.787578,-0.232098,0.456932,-0.437572,-0.415643,0.034471,-0.173846,-0.294360,0.129467,-0.073206,0.416882,-0.499365


In [16]:
sample_train_word_vectors = pd.concat([sample_train_data,avg_word_vectors_df],axis=1)
sample_train_word_vectors

Unnamed: 0,tweets,sign,0,1,2,3,4,5,6,7,...,10,11,12,13,14,15,16,17,18,19
0,ayee tell em ' bitches to come through,0.0,0.103042,-0.476258,0.253031,0.271344,-0.234250,-0.416527,0.350542,-0.263698,...,0.754546,-0.523776,-0.105592,0.051636,-0.148467,-0.423588,0.114603,0.056566,0.458621,-0.648659
1,<user> <user> congrats on making it dude laug...,1.0,-0.035343,-0.202305,0.569880,0.655770,-0.230325,-0.277664,0.348995,-0.435550,...,0.562428,-0.594078,-0.487470,0.059900,-0.177855,-0.337911,0.073457,-0.051864,0.510632,-0.386786
2,<user> yes ! i'd love to show you guys around ...,1.0,-0.063345,-0.098303,0.395130,0.665450,-0.270500,-0.271853,0.267416,-0.422232,...,0.555567,-0.544237,-0.445162,0.075814,-0.143498,-0.335021,0.127981,-0.023418,0.470948,-0.404352
3,<user> we'll maybe meet there - bianca,0.0,0.084570,-0.304171,0.542734,0.674538,-0.562034,0.114750,0.536907,-0.515769,...,0.503222,-0.618604,-0.227896,0.100328,-0.111213,-0.220813,-0.043338,-0.014036,0.076931,-0.208996
4,<user> hello snooki follow me back please ? i ...,0.0,-0.133802,-0.171259,0.459780,0.583461,-0.156678,-0.284474,0.412175,-0.353415,...,0.375492,-0.668245,-0.434267,-0.042149,-0.051464,-0.364874,-0.020598,-0.126889,0.458288,-0.335189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,<user> your a loser wink p hahahaha i'm not ...,1.0,-0.153317,-0.185661,0.495620,0.516081,-0.258990,-0.348437,0.509613,-0.321200,...,0.428474,-0.540596,-0.459039,-0.074225,-0.094385,-0.147408,0.035848,-0.124784,0.258645,-0.516928
199996,<user> he said thankyou,1.0,-0.072823,-0.241911,0.320355,0.616920,-0.104754,-0.029998,0.192886,-0.748162,...,0.162043,-0.466616,-0.884462,0.012675,-0.140857,-0.187924,-0.098489,0.249598,0.202039,-0.454237
199997,<user> i'll do national exam on monday till th...,1.0,-0.019281,-0.097291,0.470629,0.571047,-0.230550,-0.210472,0.365973,-0.324163,...,0.514962,-0.677924,-0.422658,0.014096,-0.116945,-0.269073,0.021149,-0.033446,0.600041,-0.371906
199998,<user> ready .. set .. here we go ! ! ! <user>...,0.0,-0.126683,-0.128776,0.439913,0.545141,-0.262770,-0.251818,0.208687,-0.350087,...,0.456932,-0.437572,-0.415643,0.034471,-0.173846,-0.294360,0.129467,-0.073206,0.416882,-0.499365


In [18]:
X_train = sample_train_word_vectors.drop(columns=["tweets","sign"])
X_train

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,0.103042,-0.476258,0.253031,0.271344,-0.234250,-0.416527,0.350542,-0.263698,-0.677710,-0.348175,0.754546,-0.523776,-0.105592,0.051636,-0.148467,-0.423588,0.114603,0.056566,0.458621,-0.648659
1,-0.035343,-0.202305,0.569880,0.655770,-0.230325,-0.277664,0.348995,-0.435550,-0.946713,-0.233682,0.562428,-0.594078,-0.487470,0.059900,-0.177855,-0.337911,0.073457,-0.051864,0.510632,-0.386786
2,-0.063345,-0.098303,0.395130,0.665450,-0.270500,-0.271853,0.267416,-0.422232,-0.849598,-0.197942,0.555567,-0.544237,-0.445162,0.075814,-0.143498,-0.335021,0.127981,-0.023418,0.470948,-0.404352
3,0.084570,-0.304171,0.542734,0.674538,-0.562034,0.114750,0.536907,-0.515769,-0.733989,-0.135776,0.503222,-0.618604,-0.227896,0.100328,-0.111213,-0.220813,-0.043338,-0.014036,0.076931,-0.208996
4,-0.133802,-0.171259,0.459780,0.583461,-0.156678,-0.284474,0.412175,-0.353415,-0.850640,-0.278544,0.375492,-0.668245,-0.434267,-0.042149,-0.051464,-0.364874,-0.020598,-0.126889,0.458288,-0.335189
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
199995,-0.153317,-0.185661,0.495620,0.516081,-0.258990,-0.348437,0.509613,-0.321200,-0.795003,-0.095673,0.428474,-0.540596,-0.459039,-0.074225,-0.094385,-0.147408,0.035848,-0.124784,0.258645,-0.516928
199996,-0.072823,-0.241911,0.320355,0.616920,-0.104754,-0.029998,0.192886,-0.748162,-0.665640,-0.080984,0.162043,-0.466616,-0.884462,0.012675,-0.140857,-0.187924,-0.098489,0.249598,0.202039,-0.454237
199997,-0.019281,-0.097291,0.470629,0.571047,-0.230550,-0.210472,0.365973,-0.324163,-0.774482,-0.112376,0.514962,-0.677924,-0.422658,0.014096,-0.116945,-0.269073,0.021149,-0.033446,0.600041,-0.371906
199998,-0.126683,-0.128776,0.439913,0.545141,-0.262770,-0.251818,0.208687,-0.350087,-0.787578,-0.232098,0.456932,-0.437572,-0.415643,0.034471,-0.173846,-0.294360,0.129467,-0.073206,0.416882,-0.499365


In [19]:
Y_train = sample_train_word_vectors.sign

In [20]:
Y_train.to_numpy()

array([0., 1., 1., ..., 1., 0., 0.])

In [21]:
X_test = sample_test_data
avg_word_vectors_test_df = pd.DataFrame(average_word_vectors(X_test.tweets ,word_embedding_df))

In [29]:
avg_word_vectors_test_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,-0.192925,-0.227320,0.531938,0.547458,-0.029546,-0.112131,0.177535,-0.155894,-0.665188,-0.311368,0.471397,-0.581751,-0.310495,0.047058,-0.352483,-0.348064,-0.194387,0.024943,0.149389,-0.378988
1,-0.105737,-0.160767,0.398062,0.659159,-0.267791,-0.235236,0.306701,-0.476188,-0.857286,-0.217737,0.459439,-0.639661,-0.425403,0.047152,-0.127996,-0.273561,0.023231,-0.089354,0.513522,-0.313904
2,-0.035192,-0.152306,0.482523,0.457505,-0.015983,-0.144561,0.206993,-0.505070,-0.737938,-0.413103,0.552791,-0.428667,-0.335365,0.081298,-0.249936,-0.276294,0.036955,-0.213982,0.603942,-0.422719
3,-0.177934,-0.083295,0.252004,0.577131,-0.376043,-0.251949,0.409394,-0.171015,-0.630986,-0.157952,0.465188,-0.520418,-0.268709,-0.060737,-0.092826,-0.208967,0.056133,0.004946,0.443746,-0.446448
4,-0.160648,-0.184098,0.346441,0.410458,-0.224065,-0.253801,0.338415,-0.520747,-0.698456,-0.148005,0.401808,-0.487821,-0.459717,0.068937,-0.204519,-0.150342,0.097417,-0.137441,0.448936,-0.506880
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,-0.063082,-0.126341,0.437498,0.675349,-0.300340,-0.314575,0.332507,-0.538570,-0.920960,-0.224309,0.603431,-0.625257,-0.436727,0.074245,-0.158525,-0.282326,0.140544,-0.080999,0.492803,-0.425586
9996,-0.083859,-0.111094,0.420981,0.631187,-0.279325,-0.222907,0.283491,-0.420281,-0.893414,-0.182745,0.518702,-0.514142,-0.476802,0.043555,-0.162426,-0.252175,0.056786,-0.050622,0.423405,-0.484523
9997,-0.125323,-0.205932,0.365661,0.434190,-0.104743,-0.125283,0.429559,-0.431185,-0.609905,-0.132508,0.499256,-0.596574,-0.356582,-0.122496,0.102354,0.002644,0.267052,0.113557,0.425755,-0.410532
9998,-0.110458,-0.105682,0.404172,0.537259,-0.262631,-0.320473,0.374739,-0.496816,-0.904411,-0.249470,0.433276,-0.496582,-0.509667,0.079502,-0.078198,-0.354974,0.131434,0.049244,0.507311,-0.418555


In [22]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [23]:
linear_reg = LogisticRegression()
linear_reg = linear_reg.fit(X_train, Y_train.to_numpy())
y_pred = linear_reg.predict(avg_word_vectors_test_df)



In [24]:
import csv

def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w',newline='') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [26]:
y_pred = [-1 if x==0 else 1 for x in y_pred]

In [28]:
ids=np.arange(1,10001)
create_csv_submission(ids, y_pred, "csv_submission_logistic_reg.csv")

---

# Naive Bayes:

In [11]:
import numpy as np
import pandas as pd

In [12]:
pos_tweets = list(open("../cleaned_data/cleaned_train_pos.txt", "r", encoding='utf-8').readlines())
pos_tweets = [s.strip() for s in pos_tweets]
neg_tweets = list(open("../cleaned_data/cleaned_train_neg.txt", "r", encoding='utf-8').readlines())
neg_tweets = [s.strip() for s in neg_tweets]
test = list(open("../cleaned_data/cleaned_test_data.txt", "r", encoding='utf-8').readlines())
test = [s.strip() for s in test]

In [26]:
pos_tweets = [line.split(',') for line in pos_tweets]
neg_tweets = [line.split(',') for line in neg_tweets]
test_tweets =[line.split(',') for line in test]

In [27]:
pos_tweets = [ list(dict.fromkeys(tweet)) for tweet in pos_tweets]
neg_tweets = [ list(dict.fromkeys(tweet)) for tweet in neg_tweets]

In [28]:
word_list_pos_tweets = [word for tweet in pos_tweets for word in tweet]
word_list_neg_tweets = [word for tweet in neg_tweets for word in tweet]

In [29]:
proba_pos=0.5
proba_neg=0.5
log_prior_pos = np.log(proba_pos)
log_prior_neg = np.log(proba_neg)

In [30]:
word_list_pos_tweets = [word for word in word_list_pos_tweets if word!='']
word_list_neg_tweets = [word for word in word_list_neg_tweets if word!='']
all_words = word_list_pos_tweets + word_list_neg_tweets

In [31]:
def vocabulary_count(all_words, pos_neg_words):
    somme = 0
    for word in all_words:
        somme=somme+1+pos_neg_words.count(word)
    return somme


In [32]:
vocabulary_set = set(all_words)

In [33]:
from collections import Counter
pos_dict = dict(Counter(word_list_pos_tweets))
neg_dict = dict(Counter(word_list_neg_tweets))
all_word_dict = dict(Counter(all_words))

In [34]:
def compute_somme_pos(vocabulary_set,pos_dict):
    somme = len(vocabulary_set)
    for word in vocabulary_set:
        if word in pos_dict.keys(): somme+=pos_dict.get(word)
    return somme
        
def compute_somme_neg(vocabulary_set,neg_dict):
    somme = len(vocabulary_set)
    for word in vocabulary_set:
        if word in neg_dict.keys(): somme+=neg_dict.get(word)
    return somme

In [35]:
pos_somme = compute_somme_pos(vocabulary_set,pos_dict)
print(pos_somme)
neg_somme = compute_somme_neg(vocabulary_set,neg_dict)
print(neg_somme)

1316816
1451065


In [36]:
def loglikelihood_pos(somme,vocabulary,pos_dict):
    array = []
    for word in vocabulary:
        if word in pos_dict.keys():
            likelihood = (pos_dict.get(word) + 1 ) / somme
        else: likelihood= 1/somme
        array.append(np.log(likelihood))
    return array

def loglikelihood_neg(somme,vocabulary,neg_dict):
    array = []
    for word in vocabulary:
        if word in neg_dict.keys():
            likelihood = (neg_dict.get(word) + 1 ) / somme
        else: likelihood= 1/somme
        array.append(np.log(likelihood))
    return array

In [37]:
loglikelihood_pos_values = loglikelihood_pos(pos_somme,vocabulary_set,pos_dict)
loglikelihood_neg_values = loglikelihood_neg(pos_somme,vocabulary_set,neg_dict)

In [38]:
pos_likelihood_dict = dict(zip(list(vocabulary_set), loglikelihood_pos_values))
neg_likelihood_dict = dict(zip(list(vocabulary_set), loglikelihood_neg_values))

In [39]:
test_tweets =[list(dict.fromkeys(tweet)) for tweet in test_tweets]
test_tweets= [[word for word in tweet if word !='' and word in vocabulary_set] for tweet in test_tweets]


In [40]:
def function_naive_bayes(test,loglikelihood_pos,loglikelihood_neg):
    predictions = []
    for tweet in test:
        sum_pos = log_prior_pos
        sum_neg = log_prior_neg
        for word in tweet:
            sum_pos=sum_pos+loglikelihood_pos.get(word)
            sum_neg=sum_neg+loglikelihood_neg.get(word)
        if sum_pos>sum_neg :
            predictions.append(1)
        else: predictions.append(-1)
    return predictions

In [41]:
y_pred = function_naive_bayes(test_tweets,pos_likelihood_dict,neg_likelihood_dict)

In [43]:
from helpers import create_csv_submission

In [44]:
create_csv_submission(y_pred,"../submissions/naive_bayes_clean_fourth_attempt.csv")

**this gave us 0.75 accuracy and 0.758 F1 score**

---

# SVM:

In [None]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectFromModel
from time import time


#### Working on sample data:

In [5]:
pos_tweets, neg_tweets, test = load_cleaned_data(full=False, stop_words=False)

Ommiting repetitions
Translating emojis
removing numbers
adding <tag> for hashtags
tokenizing
removing pontuations
dealing with slang words


In [None]:
train_data, test_data = create_train_test_dfs(pos_tweets, neg_tweets, test)

In [None]:
def clean_df(df):
    df['cleaned_tweets'] = [" ".join(word) for word in df['tweets']]
    df['cleaned_tweets'] = df['cleaned_tweets'].apply(translate_emoji)
    df['cleaned_tweets'] = [" ".join(text_processor.pre_process_doc(tweet)) for tweet in df['cleaned_tweets']]
    return df

In [None]:
train_data = clean_df(train_data)
train_data.head()

Unnamed: 0,tweets,sign,cleaned_tweets
0,"[i, so, love, the, argentina, tvc, of, curt, s...",1.0,i so love the argentina tvc of curt sol gorg h...
1,"[he, poked, him, in, the, eye]",0.0,he poked him in the eye
2,"[well, the, blackhawks, lost, tonight, well, i...",0.0,well the blackhawks lost tonight well i guess ...
3,"[i, do, not, need, tht, multistop, get, it, ba...",1.0,i do not need tht multistop get it back get it...
4,"[user, following, back]",1.0,user following back


In [None]:
x = train_data['cleaned_tweets']
y = train_data['sign']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=.02, random_state=200)

In [None]:
checker_pipeline = Pipeline([('vectorizer',  TfidfVectorizer().set_params(
        stop_words=None,
        max_features=100000,
        ngram_range=(1, 3))),
                             ('classifier', Pipeline([('feature_selection',
               SelectFromModel(LinearSVC(penalty="l1", dual=False))),
              ('classification', LinearSVC(penalty="l2"))]))])

In [None]:
def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):

    t0 = time()
    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    train_test_time = time() - t0
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy score: {0:.2f}%".format(accuracy * 100))
    print("train and test time: {0:.2f}s".format(train_test_time))
    print("-" * 80)
    return accuracy, train_test_time

In [None]:
accuracy_summary(
        checker_pipeline, x_train, y_train, x_test, y_test)

accuracy score: 83.45%
train and test time: 37.55s
--------------------------------------------------------------------------------


(0.8345, 37.55270862579346)

In [None]:
x_train = train_data['cleaned_tweets']
y_train = train_data['sign']

test_data = clean_df(test_data)
x_test = test_data['cleaned_tweets']

sentiment_fit = checker_pipeline.fit(x_train,y_train)
y_pred = sentiment_fit.predict(x_test)

In [None]:
y_pred = [-1 if pred == 0 else 1 for pred in y_pred]

In [None]:
create_csv_submission(y_pred,"SVM_trained_on_sample.csv") #0.824 Accuracy

#### Working on full data:

In [None]:
pos_tweets, neg_tweets, test = load_cleaned_data(full=True, stop_words=False)

Ommiting repetitions
Translating emojis
removing numbers
adding <tag> for hashtags
tokenizing
removing pontuations
dealing with slang words


In [None]:
full_train_data, test_data = create_train_test_dfs(pos_tweets, neg_tweets, test)

In [None]:
full_train_data = clean_df(full_train_data)
test_data = clean_df(test_data)

x_train = full_train_data['cleaned_tweets']
y_train = full_train_data['sign']

x_test = test_data['cleaned_tweets']

sentiment_fit = checker_pipeline.fit(x_train,y_train)
y_pred = sentiment_fit.predict(x_test)
y_pred = [-1 if pred == 0 else 1 for pred in y_pred]

create_csv_submission(y_pred,"SVM_trained_on_full_data.csv") #0.849 Accuracy F-1 0.853