In [1]:
import string
import numpy as np
import scipy as sc
import pandas as pd
import pickle
import csv
import random
from sklearn import *
from sklearn.model_selection import *
from sklearn.preprocessing import *
from sklearn.linear_model import *
from sklearn.ensemble import *
from sklearn.metrics import *
from sklearn.externals import *
from sklearn.utils import shuffle
from sklearn.naive_bayes import *
import tensorflow as tf
from tensorflow import keras
from nltk.corpus import stopwords
from nltk import *
from textblob import TextBlob, Word
import subprocess
from tensorflow.python.keras.preprocessing import sequence
from tensorflow.python.keras.preprocessing import text

In [2]:
SUBSET_POS = 'train_pos.txt'
SUBSET_NEG = 'train_neg.txt'

DATA_POS = 'train_pos_full.txt'
DATA_NEG = 'train_neg_full.txt'

CLEAN_DATA_POS = 'train_pos_full_clean.txt'
CLEAN_DATA_NEG = 'train_neg_full_clean.txt'
CLEAN_TEST = 'test_data_clean.txt'

TEST_DATA = 'test_data.txt'

In [3]:
# Loading data
pos = pd.read_csv(DATA_POS, header=None, delimiter="\n", names=["tweets"])
neg = pd.read_csv(DATA_NEG, header=None, delimiter="\n", names=["tweets"])
test = pd.read_csv(TEST_DATA, header=None, delimiter="\n", names=["tweets"])
voc = pd.read_csv('vocab_cut.txt', header=None)

In [5]:
pos.head()

Unnamed: 0,0
0,<user> i dunno justin read my mention or not ....
1,"because your logic is so dumb , i won't even c..."
2,""" <user> just put casper in a box ! "" looved t..."
3,<user> <user> thanks sir > > don't trip lil ma...
4,visiting my brother tmr is the bestest birthda...


In [6]:
pos.shape

(1250000, 1)

In [7]:
neg.head()

Unnamed: 0,0
0,vinco tresorpack 6 ( difficulty 10 of 10 objec...
1,glad i dot have taks tomorrow ! ! #thankful #s...
2,1-3 vs celtics in the regular season = were fu...
3,<user> i could actually kill that girl i'm so ...
4,<user> <user> <user> i find that very hard to ...


In [8]:
neg.shape

(1250000, 1)

In [371]:
voc.shape

(64476, 1)

# Data preprocessing

In [4]:
def clean_data(file, drop_dup = True):
    
    data = pd.read_csv(file, header=None, delimiter="\n", names=["tweet"])
    
    if drop_dup:
        # Removing duplicates
        data.drop_duplicates(inplace=True)
    
    # Put everything to lower case (should be already done but just to be safe)
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x.lower() for x in x.split()))
    
    # Removing user tags or other html stuff
    data['tweet'] = data['tweet'].str.replace('<.*?>','')
    
    # Removing possible mentions or urls (don't know if it's necessary but might be) 
    data['tweet'] = data['tweet'].str.replace('@\w+','')
    data['tweet'] = data['tweet'].str.replace('http.?://[^\s]+[\s]?','')
    
    # Removing punctuation and symbols
    data['tweet'] = data['tweet'].str.replace('[^\w\s]', '')
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in string.punctuation))
    
    # Removing non alphabetical character
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if x.isalpha()))
    
    # Removing characters non longer than 1
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if len(x) > 1))
    
    # Removing stopwords
    sw = stopwords.words('english')
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in sw))
    
    # Removing digits (CAN ALSO BE CONVERTED TO WORDS)
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if not x.isdigit()))
    
    # Removing words that appear less than 5
    word_freq = pd.Series(' '.join(data['tweet']).split()).value_counts()
    less_freq = word_freq[word_freq < 5]
    data['tweet'] = data['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in less_freq))
    
    # Removing multiple spaces
    data['tweet'] = data['tweet'].apply(lambda x: x.strip())
    data['tweet'] = data['tweet'].str.replace(' +',' ')

    # Lemmatization (better than stemmatization imho)
    data['tweet'] = data['tweet'].apply(lambda x: " ".join([Word(w).lemmatize() for w in x.split()]))
    
    if drop_dup:
        # Removing duplicates again
        data.drop_duplicates(inplace=True)
    
    return data

In [424]:
# Spelling correction (taking too much time)
#pos_first_clean['tweet'] = pos_first_clean['tweet'].apply(lambda x: str(TextBlob(x).correct()))

In [425]:
#neg_first_clean['tweet'] = neg_first_clean['tweet'].apply(lambda x: str(TextBlob(x).correct()))

In [5]:
def save_data(data, file_name):
    
    data.to_csv(file_name, header=False, index=False, sep=" ")
    subprocess.call(["sed -i 's/\"//g' " + file_name], shell=True)

In [435]:
clean_pos = clean_data(DATA_POS)

In [436]:
clean_neg = clean_data(DATA_NEG)

In [461]:
clean_test = clean_data(TEST_DATA, drop_dup = False)

In [463]:
save_data(clean_pos, CLEAN_DATA_POS)
save_data(clean_neg, CLEAN_DATA_NEG)
save_data(clean_test, CLEAN_TEST)

# Data Exploaration (TO FINISH)

Let's follow this approach: choosing the model according to the ratio number of samples / number of words per sample

In [None]:
# Number of tweets

In [89]:
train_data.shape[0]

2000000

In [88]:
# Median of the number of words per tweet
num_words = pd.DataFrame(train_data)[0].str.split().str.len()
np.median(num_words)

15.0

This suggests the use of sequence of vectors instead of n-grams and probably CNNs are suitable for this task 

In [95]:
# # Vocabulary with top 20000 words
# tokenizer = text.Tokenizer(num_words=20000)
# tokenizer.fit_on_texts(train_data.tolist())

# # Vectorize train and test data
# x_train = tokenizer.texts_to_sequences(train_data.tolist())
# x_val = tokenizer.texts_to_sequences(test_data.tolist())

# # Add padding for sequences
# max_length = len(max(x_train, key=len))
# x_train = sequence.pad_sequences(x_train, maxlen=max_length)
# x_val = sequence.pad_sequences(x_val, maxlen=max_length)

# Training some models

In [143]:
# # Splitting data (done manually for preserving the same number of positive and negative tweets)
# def split_data(pos, neg):

#     pos_train, pos_test, neg_train, neg_test = train_test_split(pos, neg, test_size=0.2)

#     train_data = np.concatenate((pos_train, neg_train))
#     test_data = np.concatenate((pos_test, neg_test))

#     train_labels = np.concatenate((np.ones(pos_train.shape[0]), np.zeros(neg_train.shape[0])))
#     test_labels = np.concatenate((np.ones(pos_test.shape[0]), np.zeros(neg_test.shape[0])))

#     train_data, train_labels = shuffle(train_data, train_labels, random_state=0)
#     test_data, test_labels = shuffle(test_data, test_labels, random_state=0)
    
#     return train_data, train_labels, test_data, test_labels

In [None]:
# Using NLTK classifiers (TODO)

In [6]:
# Loading data
pos_data_clean = pd.read_csv(CLEAN_DATA_POS, header=None, delimiter="\n", names=["tweets"])
neg_data_clean = pd.read_csv(CLEAN_DATA_NEG, header=None, delimiter="\n", names=["tweets"])
test_data_clean = pd.read_csv(CLEAN_TEST, header=None, delimiter="\n", names=["tweets"])

In [27]:
# Loading embeddings created before
embeddings = np.load("embeddings.npy")

In [36]:
# Building features from text
def getFeatures(fileName):
    feat_repr = []
    with open('vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
        with open(fileName) as file:
            for line in file:
                tokens = [vocab.get(t, -1) for t in line.strip().split()]
                #tokens = [t for t in tokens if t >= 0]
                if (len(tokens) == 0):
                    tokens = [-1]
                embed_sum = np.zeros(embeddings.shape[1])
                for t in tokens:
                    embed_sum = np.sum([embed_sum, embeddings[t]], axis=0)
                feat_repr.append(embed_sum/len(tokens))
    return feat_repr

In [23]:
def create_csv_submission(ids, y_pred, name):
    """
    Creates an output file in csv format for submission to kaggle
    Arguments: ids (event ids associated with each prediction)
               y_pred (predicted class labels)
               name (string name of .csv output file to be created)
    """
    with open(name, 'w') as csvfile:
        fieldnames = ['Id', 'Prediction']
        writer = csv.DictWriter(csvfile, delimiter=",", fieldnames=fieldnames)
        writer.writeheader()
        for r1, r2 in zip(ids, y_pred):
            writer.writerow({'Id':int(r1),'Prediction':int(r2)})

In [37]:
pos_feat = getFeatures(CLEAN_DATA_POS)
neg_feat = getFeatures(CLEAN_DATA_NEG)
test_feat = getFeatures(CLEAN_TEST)

In [38]:
# Splitting data and get labels

pos_clean = np.array(pos_feat)
neg_clean = np.array(neg_feat)
test_clean = np.array(test_feat)

random.seed(123)
random.shuffle(pos_clean)
random.seed(123)
random.shuffle(neg_clean)

X_tr = np.concatenate((pos_clean, neg_clean))
y_pos = np.ones(pos_clean.shape[0])
y_neg = np.zeros(neg_clean.shape[0])
y_tr = np.concatenate((y_pos, y_neg))

X_tr, y_tr = shuffle(X_tr, y_tr)

X_te = test_clean

# X_tr, X_te, y_tr, y_te = train_test_split(X_tr, y_tr)

In [39]:
X_te.shape

(10000, 20)

In [None]:
# X_tr, X_te, y_tr, y_te = split_data(pos_feat, neg_feat)

In [13]:
# pos_feat_df = pd.DataFrame(np.stack(pos_feat))
# neg_feat_df = pd.DataFrame(np.stack(neg_feat))

In [16]:
# # Label
# pos_feat_df['label'] = 1
# neg_feat_df['label'] = -1

In [336]:
# dataset = np.concatenate((pos_feat_df, neg_feat_df))
# dataset

In [18]:
# np.save("dataset", dataset)

In [19]:
# dataset = np.load("dataset.npy")

In [20]:
# # Splitting data
# X = pd.DataFrame(dataset).iloc[:,:20].values
# y = pd.DataFrame(dataset).iloc[:,20].values

# X_tr, X_te, y_tr, y_te = train_test_split(X, y)

In [40]:
# Standardizing
std_scaler = StandardScaler()
X_tr = std_scaler.fit_transform(X_tr)
X_te = std_scaler.transform(X_te)

In [41]:
# Training with Random Forest
rforest = RandomForestClassifier()

rforest.fit(X_tr, y_tr)

y_pred = rforest.predict(X_te)



In [42]:
# Create submission
y_pred_norm = (y_pred * 2)-1
create_csv_submission(range(1,10001), y_pred_norm, "rand_for.csv")

In [43]:
y_pred_norm

array([-1.,  1.,  1., ...,  1., -1., -1.])

In [345]:
# Accuracy
accuracy_score(y_te, y_pred)

0.8613923323643496

In [364]:
# Training with Naive Bayes
bayes = GaussianNB()

bayes.fit(X_tr, y_tr)

y_pred = bayes.predict(X_te)

In [365]:
# Accuracy
accuracy_score(y_te, y_pred)

0.5677248910997817

In [357]:
# Training with Logistic Regression
logistic = LogisticRegression()

logistic.fit(X_tr, y_tr)

y_pred = logistic.predict(X_te)



In [358]:
# Accuracy
accuracy_score(y_te, y_pred)

0.5719094869098845

In [359]:
# Training with Linear SVM
svm_class = svm.LinearSVC()

svm_class.fit(X_tr, y_tr)

y_pred = svm_class.predict(X_te)

In [360]:
# Accuracy
accuracy_score(y_te, y_pred)

0.5719625012149112

# Using TensorFlow for training a neural network

In [112]:
# # Splitting data
# X = np.concatenate((np.array(pos_feat), np.array(neg_feat)))
# y_pos = np.ones(pos.shape[0])
# y_neg = np.zeros(pos.shape[0])
# y = np.concatenate((y_pos, y_neg))

# X, y = shuffle(X, y, random_state=0)

# train_data, test_data, train_labels, test_labels = train_test_split(X, y)

In [115]:
# with open('vocab.pkl', 'rb') as f:
#     vocab = pickle.load(f)
# # vocab = {k:(v+3) for k,v in vocab.items()} 
# # vocab["<PAD>"] = 0
# # vocab["<START>"] = 1
# # vocab["<UNK>"] = 2
# # vocab["<UNUSED>"] = 3

In [44]:
# Converting words in tokens

def getTokens(fileName):
    data_tok = []
    with open('vocab.pkl', 'rb') as f:
            vocab = pickle.load(f)
            with open(fileName) as file:
                for line in file:
                    tokens = [vocab.get(t, -1) for t in line.strip().split()]
                    tokens = [t for t in tokens if t >= 0]
                    data_tok.append(tokens)         
    return data_tok

In [45]:
pos_clean = getTokens(DATA_POS)
neg_clean = getTokens(DATA_NEG)
test_clean = getTokens(TEST_DATA)

In [46]:
# Splitting data and get labels

pos_clean = np.array(pos_clean)
neg_clean = np.array(neg_clean)
test_clean = np.array(test_clean)

random.seed(123)
random.shuffle(pos_clean)
random.seed(123)
random.shuffle(neg_clean)

X_tr = np.concatenate((pos_clean, neg_clean))
y_pos = np.ones(pos_clean.shape[0])
y_neg = np.zeros(neg_clean.shape[0])
y_tr = np.concatenate((y_pos, y_neg))

train_data, train_labels = shuffle(X_tr, y_tr)

test_data = test_clean

# train_data, test_data, train_labels, test_labels = train_test_split(X, y)

In [47]:
# # Creating vocabulary with tokenizer of tensorflow
# tokenizer = text.Tokenizer(num_words=20000, split=" ")
# tokenizer.fit_on_texts(train_data.tolist())

# # Vectorize train and test data
# x_train = tokenizer.texts_to_sequences(train_data.tolist())
# x_test = tokenizer.texts_to_sequences(test_data.tolist())

# # Add padding for sequences
# max_length = len(max(x_train, key=len))
# x_train = sequence.pad_sequences(x_train, maxlen=max_length)
# x_test = sequence.pad_sequences(x_test, maxlen=max_length)

In [48]:
max_length = len(max(train_data, key=len))

# Add padding for sequences
train_data = sequence.pad_sequences(train_data, maxlen=max_length)
test_data = sequence.pad_sequences(test_data, maxlen=max_length)

In [71]:
# Building simple model

with open('vocab.pkl', 'rb') as f:
    vocab = pickle.load(f)

model = keras.Sequential()
model.add(keras.layers.Embedding(len(vocab), 16))
model.add(keras.layers.GlobalAveragePooling1D())
model.add(keras.layers.Dense(16, activation=tf.nn.relu))
model.add(keras.layers.Dense(1, activation=tf.nn.sigmoid))

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      (None, None, 16)          1015760   
_________________________________________________________________
global_average_pooling1d_4 ( (None, 16)                0         
_________________________________________________________________
dense_8 (Dense)              (None, 16)                272       
_________________________________________________________________
dense_9 (Dense)              (None, 1)                 17        
Total params: 1,016,049
Trainable params: 1,016,049
Non-trainable params: 0
_________________________________________________________________


In [72]:
model.compile(optimizer=tf.train.AdamOptimizer(), loss='binary_crossentropy', metrics=['accuracy'])

In [73]:
# threshold = int(train_data.shape[0] * 0.4)
# x_val = train_data[:threshold]
# partial_x_train = train_data[threshold:]

# y_val = train_labels[:threshold]
# partial_y_train = train_labels[threshold:]

In [74]:
# Training with a neural network
history = model.fit(train_data, train_labels, epochs=50, batch_size=256)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [77]:
y_pred = model.predict(test_data)

In [78]:
y_pred[np.where(y_pred <= 0.5)] = 0
y_pred[np.where(y_pred > 0.5)] = 1
y_pred_norm = (y_pred * 2)-1
create_csv_submission(range(1,10001), y_pred_norm, "conv_net_not_clean.csv")

In [330]:
# Accuracy
results = model.evaluate(test_data, test_labels)

print(results)

[0.4269382586811089, 0.81152000338264]


In [331]:
history_dict = history.history
history_dict.keys()

dict_keys(['val_loss', 'val_acc', 'loss', 'acc'])