## Preprocessing

In [2]:
import pandas as pd

# Load the tab-separated training file. It carries no header row, so the
# column names are supplied here (header=None is the documented way to say
# "no header"; a negative header value is rejected by newer pandas).
df = pd.read_csv(
    'data/twitter-2013train-A.txt',
    sep='\t',
    header=None,
    names=['id', 'label', 'tweet'],
    encoding='utf-8',
)

# Trim stray whitespace around the label text *before* mapping it, and keep
# the result: a label such as 'positive ' would otherwise miss the lookup
# table and silently become NaN.
df['label'] = df['label'].str.strip().map({'positive': 1, 'negative': -1, 'neutral': 0})

# Preview only the first rows instead of rendering the whole frame.
df.head(10)

Unnamed: 0,id,label,tweet
0,264183816548130816,1,Gas by my house hit $3.39!!!! I\u2019m going t...
1,263405084770172928,-1,Theo Walcott is still shit\u002c watch Rafa an...
2,262163168678248449,-1,its not that I\u2019m a GSP fan\u002c i just h...
3,264249301910310912,-1,Iranian general says Israel\u2019s Iron Dome c...
4,262682041215234048,0,Tehran\u002c Mon Amour: Obama Tried to Establi...
5,264229576773861376,0,I sat through this whole movie just for Harry ...
6,264105751826538497,1,with J Davlar 11th. Main rivals are team Polan...
7,264094586689953794,-1,Talking about ACT\u2019s && SAT\u2019s\u002c d...
8,212392538055778304,0,"Why is \""""Happy Valentines Day\"""" trending? It..."
9,254941790757601280,-1,They may have a SuperBowl in Dallas\u002c but ...


In [2]:
# Display the raw tweet text column.
df.loc[:, "tweet"]

0       Gas by my house hit $3.39!!!! I\u2019m going t...
1       Theo Walcott is still shit\u002c watch Rafa an...
2       its not that I\u2019m a GSP fan\u002c i just h...
3       Iranian general says Israel\u2019s Iron Dome c...
4       Tehran\u002c Mon Amour: Obama Tried to Establi...
5       I sat through this whole movie just for Harry ...
6       with J Davlar 11th. Main rivals are team Polan...
7       Talking about ACT\u2019s && SAT\u2019s\u002c d...
8       Why is \""Happy Valentines Day\"" trending? It...
9       They may have a SuperBowl in Dallas\u002c but ...
10      Im bringing the monster load of candy tomorrow...
11      Apple software\u002c retail chiefs out in over...
12      @oluoch @victor_otti @kunjand I just watched i...
13      One of my best 8th graders Kory was excited af...
14      #Livewire Nadal confirmed for Mexican Open in ...
15      @MsSheLahY I didnt want to just pop up... but ...
16      @Alyoup005 @addicted2haley hmmmm  November is ...
17      #Iran 

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from nltk import tokenize

# Word-level bag of n-grams: unigrams through trigrams with English stop
# words filtered out. min_df=3 drops terms seen in fewer than three tweets
# and max_features caps the vocabulary at 10000 entries.
vectorizer_ngrams = CountVectorizer(
    analyzer="word",
    stop_words="english",
    min_df=3,
    ngram_range=(1, 3),
    max_features=10000,
)

# Character-level n-grams of length 2 to 5 with the same frequency cut-offs.
# No stop_words here: scikit-learn only applies stop words when
# analyzer == "word", so passing it for the char analyzer had no effect.
vectorizer_chars = CountVectorizer(
    analyzer="char",
    min_df=3,
    ngram_range=(2, 5),
    max_features=10000,
)

# Learn each vocabulary from the tweets and build the sparse count matrices.
features_ngrams = vectorizer_ngrams.fit_transform(df["tweet"])
features_chars = vectorizer_chars.fit_transform(df["tweet"])

In [4]:
# Pull the learned vocabularies out of both fitted vectorizers.
# NOTE(review): newer scikit-learn releases replace get_feature_names() with
# get_feature_names_out() -- confirm against the installed version.
vocab_ngrams = vectorizer_ngrams.get_feature_names()
vocab_chars = vectorizer_chars.get_feature_names()

# Vocabulary sizes first (word n-grams, then char n-grams) ...
for vocab in (vocab_ngrams, vocab_chars):
    print(len(vocab))

# ... followed by the first 100 entries of each vocabulary.
for vocab in (vocab_ngrams, vocab_chars):
    print(vocab[:100])

9888
10000
[u'00', u'00 pm', u'00 u002c', u'000', u'00am', u'00pm', u'01', u'02', u'03', u'039', u'04', u'05', u'06', u'07', u'08', u'09', u'10', u'10 11', u'10 17', u'10 18', u'10 2011', u'10 27', u'10 30', u'10 30am', u'10 http', u'10 million', u'10 years', u'100', u'1000', u'101', u'105', u'106', u'106 park', u'10am', u'10pm', u'10pm itv2', u'10pm itv2 hurrraaaaah', u'10th', u'10th annual', u'10th birthday', u'10th grade', u'10th kansas', u'10th kansas nationwide', u'10th time', u'10th time season', u'11', u'11 02', u'11 12', u'11 30', u'11 http', u'11am', u'11pm', u'11pm iparty', u'11th', u'11th 2012', u'11th international', u'11th international cloud', u'12', u'12 u002c', u'120', u'12pm', u'12th', u'13', u'134', u'13th', u'13th desperation', u'13th desperation day', u'14', u'14 u002c', u'14th', u'15', u'15 2012', u'15 http', u'15 places', u'15 places 16', u'150', u'15pm', u'15th', u'15th august', u'15th august anthem', u'16', u'16 august', u'16 august isk', u'16 official', u'16 of

In [5]:
# Show the sparse word n-gram counts of the first tweet followed by its text.
first_row = features_ngrams[0]
first_tweet = df["tweet"][0]
print("%s %s" % (first_row, first_tweet))

  (0, 1614)	1
  (0, 8985)	1
  (0, 7153)	1
  (0, 3805)	1
  (0, 1613)	1
  (0, 3404)	1
  (0, 8976)	1
  (0, 281)	1
  (0, 3813)	1
  (0, 3907)	1
  (0, 3316)	1 Gas by my house hit $3.39!!!! I\u2019m going to Chapel Hill on Sat. :)


In [6]:
# How many word n-gram features consist of 1, 2 and 3 tokens.
# Counting with a generator (rather than len(filter(...))) gives the same
# numbers and also works where filter() returns a lazy iterator.
for n_words in (1, 2, 3):
    print(sum(1 for term in vocab_ngrams if len(term.split()) == n_words))

# Blank line separating the two groups of counts.
print("")

# How many character n-gram features have length 2, 3, 4 and 5.
for n_chars in (2, 3, 4, 5):
    print(sum(1 for gram in vocab_chars if len(gram) == n_chars))

5194
3651
1043

799
2783
3768
2650


## Model training

In [4]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold
import time

In [None]:
# 5-fold cross-validation of an SVM on the word n-gram features, scored by
# macro-averaged F1. The folds are shuffled with a fixed seed.
start = time.time()

cvm = KFold(n_splits=5, shuffle=True, random_state=3)
clf = SVC(C=100, gamma=0.1)

s = cross_val_score(
    clf,
    features_ngrams,
    df["label"],
    cv=cvm,
    scoring='f1_macro',
)
print(s.mean())

# Wall-clock duration of the whole cell, in seconds.
elapsed = time.time() - start
print("Time taken: {0}".format(elapsed))

In [5]:
# 5-fold cross-validation of an SVM on the character n-gram features, scored
# by macro-averaged F1. The folds are shuffled with a fixed seed.
start = time.time()

cvm = KFold(n_splits=5, shuffle=True, random_state=3)
clf = SVC(C=100, gamma=0.1)

s = cross_val_score(
    clf,
    features_chars,
    df["label"],
    cv=cvm,
    scoring='f1_macro',
)
print(s.mean())

# Wall-clock duration of the whole cell, in seconds.
elapsed = time.time() - start
print("Time taken: {0}".format(elapsed))

  'precision', 'predicted', average, warn_for)


NameError: name 'ss' is not defined