
# Notebook for Project 2

Kaggle competition link: [Submission](https://www.kaggle.com/c/epfml17-text/submit)

## Pipeline: 


### Create cooc matrix

1. sh build_vocab.sh
2. sh cut_vocab.sh
3. python3 pickle_vocab.py
4. python3 cooc.py

Now, given the co-occurrence matrix and the vocabulary, it is not hard to train GloVe word embeddings, that is, to compute an embedding vector for each word in the vocabulary. We suggest implementing SGD updates to train the matrix factorization, as in

5. python3 glove_template.py

Once you have tested your system on the small set of 10% of all tweets, we suggest you run it on the full datasets pos_train_full.txt, neg_train_full.txt

### Building a Text Classifier:

1. Construct Features for the Training Texts: Load the training tweets and the built GloVe word embeddings. Using the word embeddings, construct a feature representation of each training tweet (by averaging the word vectors over all words of the tweet).

2. Train a Linear Classifier: Train a linear classifier (e.g. logistic regression or SVM) on your constructed features, using the scikit learn library, or your own code from the earlier labs. Recall that the labels indicate if a tweet used to contain a 🙂 or 🙁 smiley.

3. Prediction: Predict labels for all tweets in the test set.

4. Submission / Evaluation: Submit your predictions to kaggle, and verify the obtained misclassification error score. (You can also use a local separate validation set to get faster feedback on the accuracy of your system). Try to tune your system for best evaluation score.

### Extensions:
Naturally, there are many ways to improve your solution, both in terms of accuracy and computation speed. More advanced techniques can be found in the recent literature.

- find algorithm for equivalent words and rewrite tweets
- parallelize feature construction
- consider tuple with GloVe
- try feature construction alternatives
- improve ratio (done by setting a lower bound lb on the sum)



Importing useful libraries

In [None]:
import numpy as np
import pandas as pd
from split_data import split_data
%matplotlib inline
#!/usr/bin/env python3
from scipy.sparse import *
from sklearn import linear_model, preprocessing, neural_network
import numpy as np
import pickle
import random
import csv

# 0.Variables


In [None]:

#Define filenames and variables

#embeddings
embeddings_ts =      'embeddings_their_GloVe.npy'
embeddings_te =      'embeddings_te.npy'
embeddings_ts_full = 'embeddings_ts_full.npy'
embeddings_te_full = 'embeddings_te_full.npy'

#tweets
pos_ts_tweets =      'train_pos.txt'
neg_ts_tweets =      'train_neg.txt'
pos_ts_full_tweets = 'train_pos_full.txt'
neg_ts_full_tweets = 'train_neg_full.txt'
te_full_tweets =     'test_data.txt'

#vocab
file_vocab = 'vocab.pkl'

#coocurrence matrices
cooc_full = 'ccoc_full.pkl'
cooc_partial = 'cooc.pkl'


#Features variables
#pertinence = see construct_features.py
nb_dim = 20



# Helper functions


In [None]:
def GloVe(file_name=cooc_partial, destination=embeddings_ts):
    #load coocurence matrix
    with open(file_name, 'rb') as f:
        cooc = pickle.load(f)    
    
    nmax = 100
    embedding_dim = 20
    eta = 0.001
    alpha = 3 / 4
    epochs = 3
    
    xs = np.random.normal(size=(cooc.shape[0], embedding_dim))
    ys = np.random.normal(size=(cooc.shape[1], embedding_dim))
   
    #Construct vector representations xs for words
    for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
            
            f = ((n / nmax)**alpha) if n < nmax else 1
            inter_cost = (xs[ix]@(ys[jy]) - np.log(n))
            # We compute the gradients for both context and main vector words
            grad_main = f * inter_cost * ys[jy]
            grad_context = f * inter_cost * xs[ix]
    
            # Update the vector words
            xs[ix] = xs[ix] - (eta * grad_main)
            ys[jy] = ys[jy] - (eta * grad_context)
            
    #Store xs in destination file
    np.save(file=destination, arr=xs)


def construct_features(tweets, embeddings, weights, *,
                       vocab_file='vocab.pkl', pertinence=35):
    """Build one feature vector per tweet by averaging word embeddings.

    For each tweet, words found in the vocabulary are split in two groups:
    "pertinent" words (present in ``weights``) and the others. If the tweet
    has pertinent words, its feature is their weighted average (weight =
    ``weights[word] * pertinence``); otherwise it is the plain average of the
    other known words. Tweets with no usable word get no feature and their
    position is reported instead.

    Parameters
    ----------
    tweets : iterable of str
        The tweets themselves (not a file name); each is split on whitespace.
    embeddings : str
        Path of the ``.npy`` file holding the embedding matrix, one row per
        vocabulary index.
    weights : dict
        Maps pertinent words to their ratio. A stored value of ``-1`` is
        treated as "not pertinent", as before.
    vocab_file : str, keyword-only
        Path of the pickled ``word -> row index`` vocabulary.
    pertinence : float, keyword-only
        Multiplier applied to every pertinent word's weight.

    Returns
    -------
    features : numpy.ndarray
        Shape ``(n_valid, dim)``; ``(0, dim)`` when no tweet is usable.
    invalid_features : numpy.ndarray
        Positions in ``tweets`` for which no feature could be built.
    """
    # NOTE(security): pickle.load can execute arbitrary code; only load a
    # vocabulary file that was produced locally.
    with open(vocab_file, 'rb') as f:
        vocab = pickle.load(f)

    xs = np.load(embeddings)
    # Take the width from the matrix itself instead of assuming 20, so
    # embeddings trained with another dimension work too.
    nb_dim = xs.shape[1]

    features = []
    invalid_features = []

    for indl, line in enumerate(tweets):
        sum_w_pertinent = np.zeros(nb_dim)
        sum_w_others = np.zeros(nb_dim)
        count_pertinent = 0
        count_other = 0

        for word in line.split():
            local_w = vocab.get(word, -1)
            if local_w == -1:
                # Word has no embedding: ignore it.
                continue

            weight = weights.get(word, -1)
            if weight != -1:
                scaled = weight * pertinence
                count_pertinent += scaled
                sum_w_pertinent += xs[local_w] * scaled
            else:
                count_other += 1
                sum_w_others += xs[local_w]

        if count_pertinent != 0:
            # Pertinent words take precedence over everything else.
            features.append(sum_w_pertinent / count_pertinent)
        elif count_other != 0:
            # Fall back to the known but non-pertinent words.
            features.append(sum_w_others / count_other)
        else:
            # Nothing usable: report the index so the caller can fill it in.
            invalid_features.append(indl)

    # reshape keeps the result 2-D even when the list is empty, so it can
    # still be concatenated with other feature matrices.
    features = np.array(features).reshape(-1, nb_dim)
    invalid_features = np.array(invalid_features, dtype=int)

    return features, invalid_features


def policy_unpredictable():
    """Return a label for a tweet that has no features: 1 or -1 at random."""
    return np.random.choice((1, -1))


def assemble(valid, indices):
    """Merge model predictions with fallback labels for unpredictable tweets.

    Parameters
    ----------
    valid : sequence
        Predictions for the tweets that had features, in tweet order.
    indices : sequence of int
        Positions (in the full tweet list) of the tweets without features.

    Returns
    -------
    numpy.ndarray
        Array of length ``len(valid) + len(indices)``: positions listed in
        ``indices`` get ``policy_unpredictable()``, the others receive the
        entries of ``valid`` in order.
    """
    # A set makes each membership test O(1); scanning the array for every
    # position made the merge quadratic on large inputs.
    invalid_positions = set(np.asarray(indices).ravel().tolist())
    total = len(valid) + len(indices)

    result = [0] * total
    nb_inserted = 0
    for pos in range(total):
        if pos in invalid_positions:
            result[pos] = policy_unpredictable()
        else:
            result[pos] = valid[nb_inserted]
            nb_inserted += 1
    return np.array(result)

def accuracy(prediction, actual_emotions):
    """Return the percentage of predictions that match the true labels.

    Computed as ``(1 - sum(|actual - predicted|) / (2 * n)) * 100``, which is
    the share of correct answers when labels are +1 / -1 (each mistake
    contributes an absolute difference of 2).
    """
    total_gap = np.sum(np.abs(actual_emotions - prediction))
    error_rate = total_gap / (2 * len(actual_emotions))
    return (1 - error_rate) * 100

# 1.Construct words_embeddings for training set


In [None]:
# Train word embeddings with the default arguments: the co-occurrence matrix
# is read from `cooc_partial` and the word vectors are saved to `embeddings_ts`.
GloVe()


# Classify tweets

In [None]:
def build_and_predict(pertinent_lb):
    """Train on a split of the small tweet set and predict the held-out part.

    Parameters
    ----------
    pertinent_lb : int or str
        Lower bound used to pick the ratio file
        ``'relevant_vocab_lb=<pertinent_lb>.txt'``.

    Returns
    -------
    labels : numpy.ndarray
        Predicted labels for the held-out positive tweets followed by the
        held-out negative tweets.
    true_labels : numpy.ndarray
        Matching ground truth: +1 for the positive part, -1 for the negative.
    """
    # Select the ratio file for this lower bound.
    relevant_vocab = 'relevant_vocab_lb=' + str(pertinent_lb) + '.txt'

    # Load the word -> ratio table into a dictionary.
    new_vocab = pd.read_csv(filepath_or_buffer=relevant_vocab, sep=" ")
    weights = dict(zip(new_vocab["word"], new_vocab["ratio"]))

    # Read each corpus inside a context manager so the file is closed.
    with open(pos_ts_tweets, 'r') as f:
        pos_tweets = np.array(f.readlines())
    with open(neg_ts_tweets, 'r') as f:
        neg_tweets = np.array(f.readlines())

    # split_data is a project helper; 0.996 is presumably the training
    # fraction -- confirm against split_data.py. The label halves it returns
    # are unused because labels are rebuilt from the feature counts below.
    labels_pos = np.ones(len(pos_tweets))
    pos_tr, _, pos_te, _ = split_data(pos_tweets, labels_pos, 0.996)

    labels_neg = np.full(len(neg_tweets), -1)
    neg_tr, _, neg_te, _ = split_data(neg_tweets, labels_neg, 0.996)

    # Features for tweets with at least one known word, plus the indices of
    # the tweets that cannot be predicted.
    pos_tr_feat, invalid_pos_tr = construct_features(pos_tr, embeddings_ts, weights)
    pos_te_feat, invalid_pos_te = construct_features(pos_te, embeddings_ts, weights)
    neg_tr_feat, invalid_neg_tr = construct_features(neg_tr, embeddings_ts, weights)
    neg_te_feat, invalid_neg_te = construct_features(neg_te, embeddings_ts, weights)

    print(len(pos_tr_feat), len(invalid_pos_tr))
    print(len(pos_te_feat), len(invalid_pos_te))
    print(len(neg_tr_feat), len(invalid_neg_tr))
    print(len(neg_te_feat), len(invalid_neg_te))

    neural = neural_network.MLPClassifier()
    scaler = preprocessing.StandardScaler()

    # Fit the scaler and the classifier on the training features only.
    X = np.concatenate((pos_tr_feat, neg_tr_feat))
    Y = np.concatenate((np.ones(len(pos_tr_feat)), np.full(len(neg_tr_feat), -1)))
    X = scaler.fit_transform(X, Y)
    neural = neural.fit(X, Y)

    # Scale held-out data with the TRAINING statistics. Re-fitting the scaler
    # on each held-out class centred each class on its own mean, so the
    # classifier saw data on a different scale than it was trained on.
    pos_te_feat_scaled = scaler.transform(pos_te_feat)
    neg_te_feat_scaled = scaler.transform(neg_te_feat)

    pos_prediction = neural.predict(pos_te_feat_scaled)
    neg_prediction = neural.predict(neg_te_feat_scaled)

    # Put fallback labels back at the positions of unpredictable tweets.
    pos_labels = assemble(pos_prediction, invalid_pos_te)
    neg_labels = assemble(neg_prediction, invalid_neg_te)

    labels = np.concatenate((pos_labels, neg_labels))
    true_labels = np.concatenate((np.ones(len(pos_labels)), np.full(len(neg_labels), -1)))

    return labels, true_labels
    

# submission procedure


In [None]:
def submit():
    """Train on the full tweet sets and predict labels for the test tweets.

    Returns
    -------
    numpy.ndarray
        One label per line of the test file, in file order. Tweets without
        any known word receive the fallback label chosen by ``assemble``.
    """
    relevant_vocab = 'relevant_vocab_full_lb=5000.txt'
    embeddings_file = 'embeddings_full_epoch_10.npy'

    # Load the word -> ratio table into a dictionary.
    new_vocab = pd.read_csv(filepath_or_buffer=relevant_vocab, sep=" ")
    weights = dict(zip(new_vocab["word"], new_vocab["ratio"]))

    print(len(new_vocab))

    # construct_features iterates over tweets, so it must receive the lines
    # of each file; passing the file name made it iterate over the
    # characters of the name instead.
    with open(pos_ts_full_tweets, 'r') as f:
        pos_tweets = f.readlines()
    with open(neg_ts_full_tweets, 'r') as f:
        neg_tweets = f.readlines()
    with open(te_full_tweets, 'r') as f:
        te_tweets = f.readlines()

    # Training features
    pos_ts_full_feat, invalid_pos_ts_full = construct_features(pos_tweets, embeddings_file, weights)
    neg_ts_full_feat, invalid_neg_ts_full = construct_features(neg_tweets, embeddings_file, weights)

    print(len(pos_ts_full_feat), len(invalid_pos_ts_full))
    print(len(neg_ts_full_feat), len(invalid_neg_ts_full))

    # Test features
    te_full_feat, invalid_te_full = construct_features(te_tweets, embeddings_file, weights)
    print(len(te_full_feat), len(invalid_te_full))

    neural = neural_network.MLPClassifier()
    scaler = preprocessing.StandardScaler()

    # Fit the scaler and the classifier on the training features only.
    X = np.concatenate((pos_ts_full_feat, neg_ts_full_feat))
    Y = np.concatenate((np.ones(len(pos_ts_full_feat)), np.full(len(neg_ts_full_feat), -1)))
    X = scaler.fit_transform(X, Y)
    neural = neural.fit(X, Y)

    # Scale the test set with the TRAINING statistics rather than re-fitting
    # the scaler on the test data.
    te_full_feat_scaled = scaler.transform(te_full_feat)

    te_prediction = neural.predict(te_full_feat_scaled)

    # Put fallback labels back at the positions of unpredictable tweets.
    labels = assemble(te_prediction, invalid_te_full)

    return labels
    
    
    

# Generate data

In [None]:
# Evaluate the classifier for each candidate lower bound.
# The list is currently empty, so this cell does nothing.
for lb in []:
    label_nn, true_labels_nn = build_and_predict(lb)
    score = accuracy(label_nn, true_labels_nn)
    print('accuracy ={!s} for lb = {!s}'.format(score, lb))

# create submission file

In [None]:
# Run the full pipeline and write one "Id,Prediction" row per test tweet.
# The result was previously stored as `pedict_te` but read as `predict_te`,
# which raised a NameError before any row was written.
predict_te = submit()

# newline='' lets the csv writer control line endings itself, so the file
# ends lines with '\n' on every platform.
with open('submission.csv', 'w', newline='') as f2:
    fields = ('Id', 'Prediction')
    wr = csv.DictWriter(f2, fieldnames=fields, lineterminator='\n')
    wr.writeheader()

    for id_tweet, prediction in enumerate(predict_te):
        # Ids in the submission file are 1-based.
        wr.writerow({'Id': id_tweet + 1, 'Prediction': int(prediction)})
        print(id_tweet)