In [1]:
import pandas as ps

import matplotlib.pylab as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.neural_network import MLPClassifier
from sklearn import linear_model
from sklearn.svm import LinearSVC

from gensim import models

import warnings 
warnings.simplefilter('ignore')

from proj2_helpers import *
from get_embeddings_ML import *
from ML_sklearn import *

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
from nltk import word_tokenize, WordNetLemmatizer

In [3]:
def get_word2vec_embeddings(vectors, clean_comments, generate_missing=False):
    embeddings = clean_comments['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

### DATA LOADING

In [4]:
RESULT_POS_PATH = './Results/pp_pos_otpl_nd.txt'
RESULT_NEG_PATH = './Results/pp_neg_otpl_nd.txt'
RES_PATH = './Results/pp_test_otpl.txt'

In [5]:
# load the data files = list with each line being a tweet
result_pos = open(RESULT_POS_PATH, "r").read().splitlines()
result_neg = open(RESULT_NEG_PATH, "r").read().splitlines()
test_set = open(RES_PATH, "r").read().splitlines()

### DATAFRAME CONSTRUCTION

In [6]:
#-----------------------------------------TRAINING SET---------------------------------------------------------------------------

# create labels
label_pos = [1] * len(result_pos)
#create a df
pos_df = pd.DataFrame(list(zip(label_pos, result_pos)),columns=["Sentiment","Tweet"]) 
del label_pos

# create labels
label_neg = [-1] * len(result_neg)
# create a df
neg_df = pd.DataFrame(list(zip(label_neg, result_neg)),columns=["Sentiment","Tweet"]) #create a df
del label_neg

# regroup the dfs, ignore index in order to get new ones (->no duplicate)
train_df = pd.concat([pos_df,neg_df],ignore_index=True) #regroup the dfs, ignore index in order to get new ones (->no duplicate)

train_tokens = [word_tokenize(sen) for sen in train_df.Tweet] 

train_df['tokens'] = train_tokens

# shuffle the rows
train_df = train_df.sample(frac=1) 

In [7]:
train_df

Unnamed: 0,Sentiment,Tweet,tokens
159233,-1,season finale waterloo road life,"[season, finale, waterloo, road, life]"
63242,1,god hear dear regard fam ffb pin pls lose,"[god, hear, dear, regard, fam, ffb, pin, pls, ..."
168215,-1,bad ground could get food,"[bad, ground, could, get, food]"
165946,-1,new look sewing pattern miss dress size miss d...,"[new, look, sewing, pattern, miss, dress, size..."
108707,-1,sorry danny lancaster tonight need,"[sorry, danny, lancaster, tonight, need]"
...,...,...,...
109339,-1,mitac mio minisync mobile charge kit high curr...,"[mitac, mio, minisync, mobile, charge, kit, hi..."
38657,1,depend quality good need one get one,"[depend, quality, good, need, one, get, one]"
164762,-1,city star war run laptop,"[city, star, war, run, laptop]"
145596,-1,gon without shah every sunday,"[gon, without, shah, every, sunday]"


In [8]:
#-----------------------------------------TEST SET---------------------------------------------------------------------------
test_ids = np.linspace(1,10000,10000, dtype=int)
# create a df
test_df = pd.DataFrame(list(zip(test_ids, test_set)), columns=["Tweet_submission_id","Tweet"]) 

test_tokens = [word_tokenize(sen) for sen in test_df.Tweet] 

test_df['tokens'] = test_tokens

### Split data into test and train

In [9]:
data_train, data_test = train_test_split(train_df, test_size=0.10, random_state=42)

In [10]:
all_training_words = [word for tokens in data_train["tokens"] for word in tokens]
training_sentence_lengths = [len(tokens) for tokens in data_train["tokens"]]
TRAINING_VOCAB = sorted(list(set(all_training_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_training_words), len(TRAINING_VOCAB)))
print("Max sentence length is %s" % max(training_sentence_lengths))

1080474 words total, with a vocabulary size of 40166
Max sentence length is 26


In [11]:
all_test_words = [word for tokens in data_test["tokens"] for word in tokens]
test_sentence_lengths = [len(tokens) for tokens in data_test["tokens"]]
TEST_VOCAB = sorted(list(set(all_test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_test_words), len(TEST_VOCAB)))
print("Max sentence length is %s" % max(test_sentence_lengths))

119103 words total, with a vocabulary size of 13829
Max sentence length is 20


### WORD EMBEDDING

In [12]:
word2vec_path = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

In [13]:
training_embeddings = get_word2vec_embeddings(word2vec, data_train, generate_missing=True)

In [14]:
MAX_SEQUENCE_LENGTH = 50
EMBEDDING_DIM = 300

#### Tokenize and Pad sequences

In [15]:
tokenizer = Tokenizer(num_words=len(TRAINING_VOCAB), lower=True, char_level=False)
tokenizer.fit_on_texts(data_train["Tweet"].tolist())
training_sequences = tokenizer.texts_to_sequences(data_train["Tweet"].tolist())

train_word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(train_word_index))

Found 40165 unique tokens.


In [16]:
train_nn_data = pad_sequences(training_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [17]:
train_embedding_weights = np.zeros((len(train_word_index)+1, EMBEDDING_DIM))
for word,index in train_word_index.items():
    train_embedding_weights[index,:] = word2vec[word] if word in word2vec else np.random.rand(EMBEDDING_DIM)
print(train_embedding_weights.shape)

(40166, 300)


In [18]:
X_train = train_nn_data

In [19]:
y_train = data_train.Sentiment.values
y_test = data_test.Sentiment.values

In [20]:
test_sequences = tokenizer.texts_to_sequences(data_test["Tweet"].tolist())
test_data = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [21]:
X_test = test_data

#### PCA 

In [22]:
# create instance of StandardScaler
scaler = StandardScaler()
# fit on train set only
scaler.fit(X_train)
# apply transform to train and test
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# create instance of PCA
pca = PCA(.95)
# fit PCA on train set only
pca.fit(X_train)
# apply on train and test 
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [28]:
X_train = pad_sequences(X_train, maxlen=MAX_SEQUENCE_LENGTH)
X_test = pad_sequences(X_test, maxlen=MAX_SEQUENCE_LENGTH)

### NN

In [30]:
clf = MLPClassifier(solver='adam', alpha=1e-4, hidden_layer_sizes=(2, X_train.shape[1]), random_state=4, verbose=False, learning_rate='constant')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
compute_accuracy(y_test, y_pred)

0.5751068006003925


In [31]:
all_Test_words = [word for tokens in test_df["tokens"] for word in tokens]
Test_sentence_lengths = [len(tokens) for tokens in test_df["tokens"]]
r_TEST_VOCAB = sorted(list(set(all_Test_words)))
print("%s words total, with a vocabulary size of %s" % (len(all_Test_words), len(r_TEST_VOCAB)))
print("Max sentence length is %s" % max(Test_sentence_lengths))

68947 words total, with a vocabulary size of 9748
Max sentence length is 19


In [32]:
Test_sequences = tokenizer.texts_to_sequences(test_df["Tweet"].tolist())
Test_nn = pad_sequences(Test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

In [33]:
r_y_pred = clf.predict(Test_nn)

In [34]:
len(r_y_pred)

10000

### CREATE SUBMISSION

In [35]:
test_id = test_df['Tweet_submission_id'].to_numpy()

In [36]:
len(test_id)

10000

In [37]:
create_csv_submission(test_id,r_y_pred, "./Submissions/NN_W2V_SUB.csv")