## Preprocessing

In [2]:
import pandas as pd
from sklearn.metrics import classification_report
import numpy as np

# Load the training tweets: tab-separated, no header row, so the three columns
# are named explicitly below. `header=None` is the supported way to say "no
# header row" (`header=-1` is rejected by current pandas).
df = pd.read_csv('data/twitter-2013train-A.txt', sep='\t', header=None, encoding='utf-8')
df.columns = ['id', 'label', 'tweet']
# Strip surrounding whitespace and KEEP the result; the previous code discarded
# it, so a label such as "positive " would have been mapped to NaN.
df['label'] = df['label'].apply(lambda x: x.strip())
# Encode the sentiment classes as integers: positive=1, negative=-1, neutral=0.
df['label'] = df['label'].map({'positive': 1, 'negative': -1, 'neutral': 0})
# NOTE(review): the tweet text displayed from this file contains literal escape
# sequences such as \u2019 and \u002c, which surface as tokens like "u002c" in
# the word n-gram vocabulary. Consider decoding them before vectorizing --
# TODO confirm against the raw file.
df

Unnamed: 0,id,label,tweet
0,264183816548130816,1,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,-1,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,-1,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,-1,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,0,Tehran\u002c Mon Amour: Obama Tried to Establi...
5,264229576773861376,0,I sat through this whole movie just for Harry ...
6,264105751826538497,1,with J Davlar 11th. Main rivals are team Polan...
7,264094586689953794,-1,Talking about ACT\u2019s && SAT\u2019s\u002c d...
8,212392538055778304,0,"Why is \""""Happy Valentines Day\"""" trending? It..."
9,254941790757601280,-1,They may have a SuperBowl in Dallas\u002c but ...


In [2]:
# Display the raw tweet text column of the training frame.
df["tweet"]

0       Gas by my house hit $3.39!!!! I\u2019m going t...
1       Theo Walcott is still shit\u002c watch Rafa an...
2       its not that I\u2019m a GSP fan\u002c i just h...
3       Iranian general says Israel\u2019s Iron Dome c...
4       Tehran\u002c Mon Amour: Obama Tried to Establi...
5       I sat through this whole movie just for Harry ...
6       with J Davlar 11th. Main rivals are team Polan...
7       Talking about ACT\u2019s && SAT\u2019s\u002c d...
8       Why is \""Happy Valentines Day\"" trending? It...
9       They may have a SuperBowl in Dallas\u002c but ...
10      Im bringing the monster load of candy tomorrow...
11      Apple software\u002c retail chiefs out in over...
12      @oluoch @victor_otti @kunjand I just watched i...
13      One of my best 8th graders Kory was excited af...
14      #Livewire Nadal confirmed for Mexican Open in ...
15      @MsSheLahY I didnt want to just pop up... but ...
16      @Alyoup005 @addicted2haley hmmmm  November is ...
17      #Iran 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import tokenize

# Word-level bag of n-grams (unigrams to trigrams) with English stop words
# removed. min_df=3 drops n-grams seen in fewer than 3 tweets and max_features
# caps the vocabulary at 10000 entries.
vectorizer_ngrams = CountVectorizer(
    analyzer="word",
    tokenizer=None,
    preprocessor=None,
    stop_words="english",
    min_df=3,
    ngram_range=(1, 3),
    max_features=10000,
)

# Character-level bag of n-grams (2 to 5 characters), same frequency cut-offs.
# `stop_words` is intentionally not passed here: scikit-learn only applies it
# when analyzer == "word", so it had no effect on this vectorizer.
vectorizer_chars = CountVectorizer(
    analyzer="char",
    tokenizer=None,
    preprocessor=None,
    min_df=3,
    ngram_range=(2, 5),
    max_features=10000,
)

# Fit each vocabulary on the training tweets and build the sparse count matrices.
features_ngrams = vectorizer_ngrams.fit_transform(df["tweet"])
features_chars = vectorizer_chars.fit_transform(df["tweet"])

In [4]:
# Pull the fitted vocabularies out of both vectorizers.
vocab_ngrams = vectorizer_ngrams.get_feature_names()
vocab_chars = vectorizer_chars.get_feature_names()

# Vocabulary sizes first (word n-grams, then char n-grams) ...
for vocab in (vocab_ngrams, vocab_chars):
    print(len(vocab))

# ... followed by the first 100 entries of each as a sanity check.
for vocab in (vocab_ngrams, vocab_chars):
    print(vocab[:100])

9888
10000
[u'00', u'00 pm', u'00 u002c', u'000', u'00am', u'00pm', u'01', u'02', u'03', u'039', u'04', u'05', u'06', u'07', u'08', u'09', u'10', u'10 11', u'10 17', u'10 18', u'10 2011', u'10 27', u'10 30', u'10 30am', u'10 http', u'10 million', u'10 years', u'100', u'1000', u'101', u'105', u'106', u'106 park', u'10am', u'10pm', u'10pm itv2', u'10pm itv2 hurrraaaaah', u'10th', u'10th annual', u'10th birthday', u'10th grade', u'10th kansas', u'10th kansas nationwide', u'10th time', u'10th time season', u'11', u'11 02', u'11 12', u'11 30', u'11 http', u'11am', u'11pm', u'11pm iparty', u'11th', u'11th 2012', u'11th international', u'11th international cloud', u'12', u'12 u002c', u'120', u'12pm', u'12th', u'13', u'134', u'13th', u'13th desperation', u'13th desperation day', u'14', u'14 u002c', u'14th', u'15', u'15 2012', u'15 http', u'15 places', u'15 places 16', u'150', u'15pm', u'15th', u'15th august', u'15th august anthem', u'16', u'16 august', u'16 august isk', u'16 official', u'16 of

In [10]:
# Sparse count row of the first training tweet, followed by the tweet itself.
first_row = features_ngrams[0]
first_tweet = df["tweet"][0]
print("%s %s" % (first_row, first_tweet))
# Look up the n-gram stored at one of the column indices of that row.
print(vocab_ngrams[1614])

  (0, 1614)	1
  (0, 8985)	1
  (0, 7153)	1
  (0, 3805)	1
  (0, 1613)	1
  (0, 3404)	1
  (0, 8976)	1
  (0, 281)	1
  (0, 3813)	1
  (0, 3907)	1
  (0, 3316)	1 Gas by my house hit $3.39!!!! I\u2019m going to Chapel Hill on Sat. :)
chapel hill


In [6]:
# Number of word-vocabulary entries made of 1, 2 and 3 whitespace-separated tokens.
for n_words in (1, 2, 3):
    print(sum(1 for term in vocab_ngrams if len(term.split()) == n_words))
print("")
# Number of char-vocabulary entries of length 2, 3, 4 and 5.
for n_chars in (2, 3, 4, 5):
    print(sum(1 for term in vocab_chars if len(term) == n_chars))

5194
3651
1043

799
2783
3768
2650


## Model training: word and character n-grams

In [17]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
from sklearn.svm import LinearSVC
import time

In [7]:
# Load the test tweets exactly like the training tweets: tab-separated, no
# header row (`header=None`; `header=-1` is rejected by current pandas), and
# decoded as UTF-8 so both frames hold text in the same form.
df_test = pd.read_csv('data/twitter-2013test-A.txt', sep='\t', header=None, encoding='utf-8')
df_test.columns = ['id', 'label', 'tweet']
# Strip surrounding whitespace and KEEP the result; the previous code discarded
# it, so a label such as "positive " would have been mapped to NaN.
df_test['label'] = df_test['label'].apply(lambda x: x.strip())
# Same integer encoding as the training labels: positive=1, negative=-1, neutral=0.
df_test['label'] = df_test['label'].map({'positive': 1, 'negative': -1, 'neutral': 0})
df_test

Unnamed: 0,id,label,tweet
0,264238274963451904,1,"@jjuueellzz down in the Atlantic city, ventnor..."
1,218775148495515649,1,Musical awareness: Great Big Beautiful Tomorro...
2,258965201766998017,0,On Radio786 100.4fm 7:10 Fri Oct 19 Labour ana...
3,262926411352903682,-1,"Kapan sih lo ngebuktiin,jan ngomong doang Susa..."
4,171874368908050432,0,"Excuse the connectivity of this live stream, f..."
5,256010056942903296,1,Show your LOVE for your local field & it might...
6,253809989599232000,0,"Milton on Bolton Wanderers 2 v 2 Leeds United,..."
7,261776619146985472,1,@firecore Can you tell me when an update for t...
8,264143999374356481,1,"@Heavensbasement The Crown, Filthy McNastys, K..."
9,223052929131757571,0,Uncover the Eternal City! Return flights to Ro...


In [37]:
# Vectorize the test tweets with the word n-gram vectorizer; `transform` (not
# `fit_transform`) reuses the vocabulary it already holds instead of re-fitting.
features_ngrams_test = vectorizer_ngrams.transform(df_test["tweet"])

In [45]:
# Baseline: linear SVM on word n-gram counts.
# random_state is pinned because LinearSVC's solver shuffles the data, so
# without a seed the scores below can differ from run to run.
clf = LinearSVC(random_state=0)
clf.fit(features_ngrams, df["label"])
preds = clf.predict(features_ngrams_test)

# Per-class precision / recall / F1 on the test tweets.
print(classification_report(df_test["label"], preds))

             precision    recall  f1-score   support

         -1       0.53      0.33      0.41       559
          0       0.60      0.74      0.66      1513
          1       0.67      0.60      0.64      1475

avg / total       0.62      0.62      0.61      3547





In [46]:
# Vectorize the test tweets with the character n-gram vectorizer; `transform`
# (not `fit_transform`) reuses the vocabulary it already holds instead of re-fitting.
features_chars_test = vectorizer_chars.transform(df_test["tweet"])

In [47]:
# Baseline: linear SVM on character n-gram counts.
# random_state is pinned because LinearSVC's solver shuffles the data, so
# without a seed the scores below can differ from run to run.
clf = LinearSVC(random_state=0)
clf.fit(features_chars, df["label"])
preds = clf.predict(features_chars_test)

# Per-class precision / recall / F1 on the test tweets.
print(classification_report(df_test["label"], preds))

             precision    recall  f1-score   support

         -1       0.44      0.39      0.42       559
          0       0.60      0.68      0.64      1513
          1       0.67      0.60      0.64      1475

avg / total       0.60      0.60      0.60      3547



## Model training: word embeddings

### *The embeddings themselves are generated in `features.py`; this notebook only loads the saved arrays*

In [3]:
import numpy as np

# Read the precomputed embedding matrices back from their .npz archives; each
# archive stores its single array under the default key 'arr_0'.
with np.load('emb_train.npz') as train_archive:
    emb_train = train_archive['arr_0']
with np.load('emb_test.npz') as test_archive:
    emb_test = test_archive['arr_0']

# Show both shapes as a quick sanity check.
(emb_train.shape, emb_test.shape)

((9684, 300), (3547, 300))

In [18]:
# The embeddings are produced outside this notebook, so make sure they still
# line up one-to-one with the label frames before training on them.
assert emb_train.shape[0] == len(df), "emb_train rows do not match training labels"
assert emb_test.shape[0] == len(df_test), "emb_test rows do not match test labels"

# Linear SVM on the embedding vectors.
# random_state is pinned because LinearSVC's solver shuffles the data, so
# without a seed the scores below can differ from run to run.
clf = LinearSVC(random_state=0)
clf.fit(emb_train, df["label"])
preds = clf.predict(emb_test)

# Per-class precision / recall / F1 on the test tweets.
print(classification_report(df_test["label"], preds))

             precision    recall  f1-score   support

         -1       0.65      0.34      0.45       559
          0       0.60      0.83      0.70      1513
          1       0.73      0.58      0.65      1475

avg / total       0.66      0.65      0.64      3547



In [4]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm

# Hyperparameter grid: linear-kernel SVC over four values of C.
parameter_candidates = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']},
]
# cv is pinned to 3 folds (the value the recorded run used) so the search does
# not silently change when the library default changes.
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, cv=3, n_jobs=-1, verbose=10)
clf.fit(emb_train, df["label"])

# Mean cross-validated score of the best candidate. Formatted into a single
# string so it prints as text rather than as a tuple under Python 2.
print('Best score: %s' % clf.best_score_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV] kernel=linear, C=1 ..............................................
[CV] kernel=linear, C=1 ..............................................
[CV] kernel=linear, C=1 ..............................................
[CV] kernel=linear, C=10 .............................................
[CV] .............. kernel=linear, C=10, score=0.646640, total= 1.4min
[CV] kernel=linear, C=10 .............................................
[CV] ............... kernel=linear, C=1, score=0.654382, total= 1.4min
[CV] kernel=linear, C=10 .............................................
[CV] ............... kernel=linear, C=1, score=0.648280, total= 1.4min
[CV] kernel=linear, C=100 ............................................
[CV] ............... kernel=linear, C=1, score=0.650248, total= 1.4min
[CV] kernel=linear, C=100 ............................................
[CV] .............. kernel=linear, C=10, score=0.639293, total=  57.1s
[CV] kernel=linea

[Parallel(n_jobs=-1)]: Done   5 tasks      | elapsed:  3.4min


[CV] ............. kernel=linear, C=100, score=0.640136, total= 1.7min
[CV] kernel=linear, C=1000 ...........................................


[Parallel(n_jobs=-1)]: Done   7 out of  12 | elapsed:  3.9min remaining:  2.8min


[CV] ............. kernel=linear, C=100, score=0.638786, total= 1.7min
[CV] kernel=linear, C=1000 ...........................................
[CV] ............. kernel=linear, C=100, score=0.633716, total= 1.3min


[Parallel(n_jobs=-1)]: Done   9 out of  12 | elapsed:  4.9min remaining:  1.6min


[CV] ............ kernel=linear, C=1000, score=0.639517, total= 5.3min
[CV] ............ kernel=linear, C=1000, score=0.639095, total= 5.3min
[CV] ............ kernel=linear, C=1000, score=0.631856, total= 5.2min


[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:  9.5min finished


('Best score for data1:', 0.65097067327550595)


In [5]:
# Display the estimator (and its hyperparameters) selected by the grid search.
clf.best_estimator_

SVC(C=1, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape=None, degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [8]:
# Predict the test embeddings with the current `clf` and summarize
# per-class precision / recall / F1.
preds = clf.predict(emb_test)
report = classification_report(df_test["label"], preds)

print(report)

             precision    recall  f1-score   support

         -1       0.64      0.32      0.42       559
          0       0.59      0.84      0.69      1513
          1       0.74      0.56      0.64      1475

avg / total       0.66      0.64      0.63      3547



In [None]:
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn import svm

# Hyperparameter grid: RBF-kernel SVC over four values of C and two of gamma.
parameter_candidates = [
    {'C': [1, 10, 100, 1000], 'gamma': [0.001, 0.0001], 'kernel': ['rbf']},
]
# cv is pinned to 3 folds (the value the recorded run used) so the search does
# not silently change when the library default changes.
clf = GridSearchCV(estimator=svm.SVC(), param_grid=parameter_candidates, cv=3, n_jobs=-1, verbose=10)
clf.fit(emb_train, df["label"])

# Mean cross-validated score of the best candidate. Formatted into a single
# string so it prints as text rather than as a tuple under Python 2.
print('Best score: %s' % clf.best_score_)

# Evaluate the selected model on the held-out test embeddings.
preds = clf.predict(emb_test)

print(classification_report(df_test["label"], preds))

Fitting 3 folds for each of 8 candidates, totalling 24 fits
[CV] kernel=rbf, C=1, gamma=0.001 ....................................
[CV] kernel=rbf, C=1, gamma=0.001 ....................................
[CV] kernel=rbf, C=1, gamma=0.001 ....................................
[CV] kernel=rbf, C=1, gamma=0.0001 ...................................
