In [7]:
# Load the labelled training tweets and the unlabelled test tweets from CSV.
# NOTE(review): 'package-lock.csv' is an unusual name for a training set — confirm the path.
import pandas as pd
train = pd.read_csv('package-lock.csv')
test = pd.read_csv('test_tweets_anuFYb8.csv')

# Keep exactly id, tweet, label in that order; the positional iloc slicing in a
# later cell relies on label being the last (third) column.
train = train.reindex(['id','tweet','label'], axis=1)
# Display three random rows as a quick sanity check.
train.sample(3)

Unnamed: 0,id,tweet,label
30899,30900,operation 'surprise parents' was a great succe...,0
30050,30051,so right now! #finally going to try some #hu...,0
849,850,@user @user @user @user @user @user @user @use...,0


In [2]:
# Display three random rows of the test frame.
test.sample(3)

Unnamed: 0,id,tweet
766,32729,@user even the weak will be able to fight #zio...
9611,41574,no type of shows come on @ this time of night ...
15808,47771,lost and insecure. #love #beautiful #gorgeous...


In [8]:
# (rows, columns) of the training frame followed by the test frame.
print(train.shape, test.shape)

(31962, 3) (17197, 2)


In [12]:
# Positional split of the training frame: every column except the last
# (id, tweet) becomes X, the third column (label) becomes y.
X = train.iloc[:, :-1].values
y = train.iloc[:, 2].values

# train_test_split is imported from sklearn.model_selection, the same module the
# cross-validation cell further down uses; the older sklearn.cross_validation
# module is no longer shipped with scikit-learn.
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows; random_state pins the shuffle so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Print the resulting arrays (NumPy abbreviates long arrays with "...").
print("X_train: " + str(X_train) + "\n")
print("X_test: " + str(X_test) + "\n")
print("y_train: " + str(y_train) + "\n")
print("y_test: " + str(y_test) + "\n")


X_train: [[13487
  'how could you all ever lose your faith in me....   #pain #mistrust #relationship #outcast  #twitter #life #love #friendship #cruel']
 [3936 'my name often times auto corrects to leukemia  ']
 [26592
  "suddenly staing to feel real now i'm finished both jobs ð\x9f\x98»  "]
 ..., 
 [9846
  'another #melbourne snap, this guy played the most beautiful sounding instrument!   #streetphotography ']
 [10800 '@user thanks for the retweet :)  ']
 [2733
  " @user .@user kicks off today! check out the full list of guests we're   to see this weekend! "]]

X_test: [[13668 'i am thankful for sunshine.#thankful #positive   ']
 [22091
  "up late....i am tired but i can't sleep. my eyes are swollen from crying. my brothers and sisters ð\x9f\x91\xadð\x9f\x91¬ð\x9f\x8c\x88ð\x9f\x91¼ð\x9f\x99\x8fð\x9f\x98¢   #prayfoheworld #pulse #help"]
 [21398 'series finale of house of lies tonight.  ']
 ..., 
 [20628
  ' @user pls  #norfolkhour the eaaa norfolk polo festival stas tomorrow!  !  ']
 [

In [14]:
# NLTK's TweetTokenizer with its default settings; its tokenize method is handed
# to the TF-IDF vectorizer in the next cell.
from nltk.tokenize import TweetTokenizer

tokenizer = TweetTokenizer()

In [16]:
# TF-IDF features over unigrams and bigrams (ngram_range=(1, 2)), splitting text
# with the tweet tokenizer instead of the vectorizer's default token pattern.
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(ngram_range=(1, 2), tokenizer=tokenizer.tokenize)

In [17]:
# Display the vectorizer's full parameter set.
vectorizer

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<bound method TweetTokenizer.tokenize of <nltk.tokenize.casual.TweetTokenizer object at 0x10d140828>>,
        use_idf=True, vocabulary=None)

In [18]:
# Fit the vectorizer on the train and test tweets together, then transform each
# set with that single fitted vectorizer so both share one feature space.
# NOTE(review): fitting on the test tweets lets test-set statistics influence the
# features used in the cross-validation cells below — confirm this is intended.
full_text = list(train['tweet'].values) + list(test['tweet'].values)
vectorizer.fit(full_text)
train_vectorized = vectorizer.transform(train['tweet'])
test_vectorized = vectorizer.transform(test['tweet'])

In [31]:
%%time
# Logistic regression (default settings) wrapped in a one-vs-rest meta-classifier
# and fitted on every training row; %%time reports how long the cell takes.
# y comes from the label column of `train` in its original row order, the same
# order train_vectorized was built in.
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
logreg = LogisticRegression()
ovr = OneVsRestClassifier(logreg)

ovr.fit(train_vectorized, y)

CPU times: user 693 ms, sys: 18.9 ms, total: 712 ms
Wall time: 722 ms


In [33]:
# 3-fold cross-validated accuracy of the one-vs-rest logistic regression on the
# TF-IDF training matrix; n_jobs=-1 runs the folds in parallel on all cores.
# NOTE(review): accuracy can look high on skewed labels — confirm the class
# balance before relying on this number.
import numpy as np
from sklearn.model_selection import cross_val_score
scores = cross_val_score(ovr, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
# Mean and standard deviation across folds, both scaled to percent.
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 94.01%, std 0.06.


In [34]:
%%time
# Same 3-fold accuracy evaluation as the logistic-regression cell, this time for a
# linear SVM constructed with dual=False. `scores` is overwritten with the SVM results.
from sklearn.svm import LinearSVC
svc = LinearSVC(dual=False)
scores = cross_val_score(svc, train_vectorized, y, scoring='accuracy', n_jobs=-1, cv=3)
print('Cross-validation mean accuracy {0:.2f}%, std {1:.2f}.'.format(np.mean(scores) * 100, np.std(scores) * 100))

Cross-validation mean accuracy 95.77%, std 0.10.
CPU times: user 167 ms, sys: 71.6 ms, total: 239 ms
Wall time: 2.92 s


In [40]:
# Fit both models on the full training matrix (cross_val_score does not fit the
# estimator objects passed to it). The trailing semicolons suppress the repr output.
ovr.fit(train_vectorized, y);
svc.fit(train_vectorized, y);

In [41]:
# Predict labels for the vectorized test tweets with the fitted linear SVM.
predictions = svc.predict(test_vectorized)

In [36]:
# Store the predictions as a new 'label' column on the test frame (modifies
# `test` in place), then display three random rows.
test['label'] = predictions
test.sample(3)

Unnamed: 0,id,tweet,label
140,32103,@user all together this christmas: pls &amp; ...,0
7373,39336,i am thankful for my home. #thankful #positive...,0
5019,36982,ex hates seeing me doin well tuff ur not bring...,0


In [42]:
# Write every column of the test frame (id, tweet, label) to submission.csv,
# leaving out the DataFrame index.
test.to_csv('submission.csv', index=False)

## Deep Learning

In [51]:
def format_data(train, test, max_features, maxlen, random_state=None):
    """Convert raw tweet frames into padded integer sequences for a Keras model.

    Parameters
    ----------
    train : pandas.DataFrame
        Must contain a 'tweet' column (strings) and a 'label' column.
    test : pandas.DataFrame
        Must contain a 'tweet' column (strings).
    max_features : int
        Passed to the Keras ``Tokenizer`` as ``num_words``.
    maxlen : int
        Passed to ``pad_sequences`` as ``maxlen`` for both train and test.
    random_state : optional
        Forwarded to ``DataFrame.sample`` when shuffling the training rows.
        The default ``None`` keeps the shuffle unseeded; pass a value to make
        the row order repeatable.

    Returns
    -------
    X : padded sequences for the shuffled training tweets.
    Y : ``to_categorical`` encoding of the labels, in the same shuffled order as X.
    test_X : padded sequences for the test tweets, in the original test order.

    Neither input frame is modified: lower-casing is applied to derived Series
    rather than written back into ``train`` or ``test``.
    """
    from keras.preprocessing.text import Tokenizer
    from keras.preprocessing.sequence import pad_sequences
    from keras.utils import to_categorical

    # Shuffle the training rows; sample() returns a new frame, so the caller's
    # `train` keeps its original order.
    shuffled = train.sample(frac=1, random_state=random_state).reset_index(drop=True)

    # Lower-case into new Series instead of assigning back into the frames, so
    # the caller's `test` is left untouched.
    train_texts = shuffled['tweet'].str.lower()
    test_texts = test['tweet'].str.lower()
    Y = to_categorical(shuffled['label'].values)

    # The word index is built from the training tweets only.
    keras_tokenizer = Tokenizer(num_words=max_features)
    keras_tokenizer.fit_on_texts(list(train_texts))

    X = pad_sequences(keras_tokenizer.texts_to_sequences(train_texts), maxlen=maxlen)
    test_X = pad_sequences(keras_tokenizer.texts_to_sequences(test_texts), maxlen=maxlen)

    return X, Y, test_X

In [52]:
# maxlen: length every tweet sequence is padded to by format_data.
# max_features: num_words given to the Keras tokenizer, and the Embedding input size below.
maxlen = 125
max_features = 10000

# Note: this rebinds the global X that an earlier cell set from train.iloc[...].
X, Y, test_X = format_data(train, test, max_features, maxlen)

In [54]:
# Hold out 25% of the padded sequences for validation; the fixed seed makes this
# split repeatable. Relies on train_test_split imported in an earlier cell, and
# rebinds X_train from that cell's hold-out split.
seed = 0
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.25, random_state=seed)


In [59]:
from keras.layers import Input, Dense, Embedding, Flatten
from keras.layers import SpatialDropout1D
# Conv1D and MaxPooling1D are exported by the public keras.layers namespace; the
# keras.layers.convolutional submodule path may not exist in other Keras versions.
from keras.layers import Conv1D, MaxPooling1D
from keras.models import Sequential
model = Sequential()

# Input / Embedding: map each of the max_features token ids to a 150-dimensional
# vector, for sequences of length maxlen.
model.add(Embedding(max_features, 150, input_length=maxlen))

# CNN
# SpatialDropout1D with rate 0.2 applied to the embedding output.
model.add(SpatialDropout1D(0.2))

# Two Conv1D (kernel size 3, 'same' padding, ReLU) + max-pool (size 2) stages
# with 32 and then 64 filters.
model.add(Conv1D(32, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Conv1D(64, kernel_size=3, padding='same', activation='relu'))
model.add(MaxPooling1D(pool_size=2))

model.add(Flatten())

# Output layer: two units, one per class of the one-hot targets produced by
# to_categorical. Softmax makes the two outputs a probability distribution, which
# is what the categorical_crossentropy loss used at compile time expects;
# independent sigmoids do not sum to 1.
model.add(Dense(2, activation='softmax'))

In [60]:
# Training hyper-parameters consumed by model.fit (and batch_size by model prediction).
epochs = 5
batch_size = 32

In [None]:
# Compile with categorical cross-entropy (targets are the one-hot Y from
# to_categorical), the Adam optimizer, and accuracy as the reported metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# Train for `epochs` passes, evaluating on the held-out validation split after each one.
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=epochs, batch_size=batch_size, verbose=1)

Train on 23971 samples, validate on 7991 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5

In [63]:
import numpy as np

# Predicted class = index of the larger of the two output units.
# Sequential.predict_classes computed the same argmax for multi-unit outputs but
# is not available in newer Keras/TensorFlow releases, so take the argmax of
# model.predict explicitly.
# Note: this overwrites any 'label' column already present on `test`.
test['label'] = np.argmax(model.predict(test_X, batch_size=batch_size, verbose=1), axis=-1)
# Write every column of the test frame to sub_cnn.csv without the DataFrame index.
test.to_csv('sub_cnn.csv', index=False)

