In [None]:
import numpy as np
import pickle

from scipy.sparse import rand
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import learning_curve
from sklearn.kernel_ridge import KernelRidge
import matplotlib.pyplot as plt
from scipy.sparse import coo_matrix

import implementations as imp
import proj1_helpers as ph
import glove as gl

from nltk import pos_tag
#nltk.download()
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from sklearn.neural_network import MLPClassifier

In [None]:
# Load the pickled training tweets and the labels later passed to clf.fit.
# `with` closes each file handle as soon as it has been read.
# NOTE: pickle.load can execute arbitrary code; only load trusted files.
with open('twitter-datasets/full_train_clean.pkl', 'rb') as f:
    tweets_train = pickle.load(f)

with open('twitter-datasets/train_smile.pkl', 'rb') as f:
    smile = pickle.load(f)


In [None]:
# Display how many entries the training pickle contained.
len(tweets_train)

In [None]:
# NLTK helpers shared by the preprocessing cells: `stemmer` is applied to the
# stop words and to test tokens, `lemmatiser` is used by ke_free_lemm and
# lemmatize.
stemmer = PorterStemmer()
lemmatiser = WordNetLemmatizer()

In [None]:
#tweets_train_nlp = [ [ stemmer.stem(ke_free_lemm(word[0],word[1])) for word in pos_tag(split_ok (x))] \
#                    for x in tweets_train [:100] \
 #                   ]
#tweets_train_nlp = [  pos_tag(x[:-2].split(' ')) for x in tweets_train [:]  ]


In [None]:
# Load the cached `tweets_train_nlp` object; later cells iterate it as a
# sequence of tweets, each tweet being a sequence of string tokens.
# `with` makes sure the file handle is closed after reading.
with open('tweets_nlp', 'rb') as f:
    tweets_train_nlp = pickle.load(f)

In [None]:
# Stem the English stop words so they can be compared with the tweet tokens
# (assumes the tokens in tweets_train_nlp are stemmed too; TODO confirm).
stopwords_stem = [ stemmer.stem(x) for x in stopwords.words('english')]
# A set gives constant-time membership tests in the filter below.
_stopwords_lookup = set(stopwords_stem)

# Flatten all tweets into one word list without stop words. The filter has to
# test the token `y`: testing the tweet `x` (a list) never equals a string, so
# no stop word would ever be removed.
list_word_with_rep = [ y for x in tweets_train_nlp for y in x if y not in _stopwords_lookup]
list_word = list(set(list_word_with_rep))

In [None]:
# Display the number of distinct words kept.
len(list_word)

In [None]:
def create_vocab(list_word):
    """Map each entry of `list_word`, stripped of surrounding whitespace, to its position.

    When several entries are equal after stripping, the position of the last
    one is kept.
    """
    return {entry.strip(): position for position, entry in enumerate(list_word)}

# Build the word -> index mapping and cache it on disk.
vocab = create_vocab(list_word)

with open('vocab_nlp.pkl', 'wb') as vocab_file:
    pickle.dump(vocab, vocab_file, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Display the vocabulary size.
len(vocab)

In [None]:
def creat_cooc(list_word_with_rep,vocab):
    """Build a sparse co-occurrence count matrix.

    Parameters
    ----------
    list_word_with_rep : iterable of str
        Text lines; each one is stripped and split on whitespace into tokens.
    vocab : dict
        Maps a token to a non-negative integer index. Tokens that are not in
        `vocab` are ignored.

    Returns
    -------
    scipy.sparse.coo_matrix
        Square matrix in which entry (i, j) counts, over all lines, the
        ordered pairs of token occurrences (i, j) found in the same line,
        including the pairing of an occurrence with itself.
    """
    # Size the matrix from the largest index rather than from the entries that
    # happen to occur, so every vocabulary index has a row and a column.
    # len(vocab) is not used because indices are not guaranteed contiguous.
    size = max(vocab.values(), default=-1) + 1

    data, row, col = [], [], []

    # Iterate the argument itself, not a module-level list.
    for line in list_word_with_rep:
        tokens = [vocab.get(t, -1) for t in line.strip().split()]
        tokens = [t for t in tokens if t >= 0]
        for t in tokens:
            for t2 in tokens:
                data.append(1)
                row.append(t)
                col.append(t2)

    cooc = coo_matrix((data, (row, col)), shape=(size, size))
    print("summing duplicates (this can take a while)")
    cooc.sum_duplicates()
    return cooc


# NOTE(review): `list_word` is the de-duplicated word list, so each "line"
# handed to creat_cooc is presumably a single word rather than a tweet, which
# would leave only self co-occurrences. Confirm whether per-tweet text was
# the intended input.
cooc = creat_cooc(list_word,vocab)
# Cache the co-occurrence matrix on disk.
with open('cooc_nlp.pkl', 'wb') as f:
        pickle.dump(cooc, f, pickle.HIGHEST_PROTOCOL)


In [None]:
def glove_create(cooc , k, nmax=100, eta=0.001, alpha=3 / 4, epochs=10, seed=None):
    """Train GloVe-style word embeddings by SGD on a co-occurrence matrix.

    Parameters
    ----------
    cooc : scipy sparse matrix
        Co-occurrence counts; converted to COO format internally.
    k : int
        Embedding dimension.
    nmax : number, optional
        Count at which the weighting term saturates at 1.0.
    eta : float, optional
        Learning rate.
    alpha : float, optional
        Exponent of the weighting term ``min(1, (n / nmax) ** alpha)``.
    epochs : int, optional
        Number of passes over the non-zero entries.
    seed : int or None, optional
        When given, the embeddings are initialised from a dedicated
        ``numpy.random.RandomState(seed)`` so the result is reproducible.
        When None, the global numpy random state is used.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(cooc.shape[0], k)`` holding the row embeddings `xs`.
        The column embeddings `ys` are trained but not returned.
    """
    # The row/col/data triplets iterated below are COO attributes.
    cooc = cooc.tocoo()

    print("loading cooccurrence matrix")

    print("{} nonzero entries".format(cooc.nnz))

    print("using nmax =", nmax, ", cooc.max() =", cooc.max())

    print("initializing embeddings")
    rng = np.random if seed is None else np.random.RandomState(seed)
    xs = rng.normal(size=(cooc.shape[0], k))
    ys = rng.normal(size=(cooc.shape[1], k))

    for epoch in range(epochs):
        print("epoch {}".format(epoch))
        for ix, jy, n in zip(cooc.row, cooc.col, cooc.data):
            xdn = np.log(n)
            fdn = min(1.0, (n/nmax)**alpha)
            # x and y are views into xs / ys, not copies.
            x = xs[ix, :]
            y = ys[jy, :]
            scale = 2 * eta * fdn * (xdn - np.dot(x, y))
            xs[ix, :] += scale * y
            # x aliases xs[ix, :], so this step sees the row updated just above.
            ys[jy, :] += scale * x

    return  xs

# Train 20-dimensional embeddings from the co-occurrence matrix.
glove = glove_create(cooc , 20)

# Dump `glove` (not `cooc`) so that 'glove_nlp.pkl' holds the embedding
# vectors rather than a second copy of the co-occurrence matrix.
with open('glove_nlp.pkl', 'wb') as f:
        pickle.dump(glove, f, pickle.HIGHEST_PROTOCOL)




In [None]:
#gl.main(20)

In [None]:
def split_id_tweet (list_tweets) :
    """Split every "<id>,<text>" line at its first comma.

    Returns a pair ``(ids, tweets)`` of lists aligned with `list_tweets`.
    Commas after the first stay in the text. A line without any comma yields
    the whole line as id and an empty text.
    """
    ids = []
    tweets = []
    for entry in list_tweets:
        head, _, tail = entry.partition(',')
        ids.append(head)
        tweets.append(tail)
    return ids , tweets

In [None]:
# Display the first training entry as a sanity check.
tweets_train[0]

In [None]:
def to_three (idx , line , not_split ) :
    """Return COO triplets ``[1, idx, word_index]`` for one tweet.

    Parameters
    ----------
    idx : int
        Row index written into every triplet.
    line : str or iterable of str
        The tweet. It is a raw string when `not_split` is truthy, otherwise an
        already tokenised sequence of words.
    not_split : bool
        Despite its name, a truthy value means "`line` has not been split
        yet", so it is split on single spaces here.

    Returns
    -------
    list of [int, int, int]
        One triplet per distinct word of the tweet that is present in the
        module-level `vocab`. Words that are not in `vocab` are skipped.
    """
    words = line.split(' ') if not_split else line

    looked_up = [vocab.get(word) for word in words]
    # Identity test against None: index 0 is a valid hit and must be kept.
    found = [index for index in looked_up if index is not None]

    # De-duplicate so every (tweet, word) cell is emitted once.
    return [[1 , idx , index] for index in set(found)]
#tweets_to_sparse = [ np.array(elem) for idx , line in enumerate(tweets,0) for elem in to_three(idx , line)]

def columns ( matrix , i) :
    """Collect element `i` of every row of `matrix` into a new list."""
    picked = []
    for row in matrix:
        picked.append(row[i])
    return picked


def tweet_to_matrix ( tweets , not_split ) :
    """Turn tweets into dense feature rows by summing word embeddings.

    Parameters
    ----------
    tweets : sequence
        The tweets. Each item is a raw string or a token sequence, as
        selected by `not_split` (forwarded unchanged to `to_three`).
    not_split : bool
        Truthy when the tweets are raw strings that still have to be split.

    Returns
    -------
    The product of a ``(len(tweets), len(glove))`` sparse indicator matrix
    with the module-level `glove` array. Row r is the sum of the `glove` rows
    of the distinct vocabulary words found in tweet r, and stays all-zero when
    none of its words is in the vocabulary.
    """
    # Plain [value, row, col] lists are enough; coo_matrix converts them itself.
    triplets = [elem
                for idx , line in enumerate(tweets)
                for elem in to_three(idx , line, not_split )]
    sparse_tweets = coo_matrix(
        (columns(triplets,0) , (columns(triplets,1),columns(triplets,2))) ,
        shape=(len(tweets), len(glove)))
    return sparse_tweets.dot(glove)

In [None]:
#sparse_tweets = coo_matrix((columns(0) , (columns(1),columns(2))) , shape=(len(tweets), len(glove)))
#tweets_res = sparse_tweets.dot(glove)
# Build one feature row per training tweet. False is passed as `not_split`,
# so to_three iterates each tweet directly instead of splitting a string.
tweets_res = tweet_to_matrix(tweets_train_nlp , False)


In [None]:
# Count the training feature rows that are entirely zero (`count`) and the
# total number of rows visited (`count2`), then show them with the ratio.
count = sum(1 for row in tweets_res if np.count_nonzero(row) == 0)
count2 = sum(1 for _ in tweets_res)
count , len(tweets_res) , count/len(tweets_res) , count2

In [None]:
# Drop the large intermediates that are no longer needed before training.
del cooc
del tweets_train_nlp

# MLP classifier with a fixed random_state.
clf = MLPClassifier(solver='lbfgs', alpha=1e-5, random_state=1)
# Take exactly one label per feature row instead of a hard-coded row count,
# so the cell keeps working when the size of the training set changes.
clf.fit(tweets_res,smile[:len(tweets_res)])

In [None]:
def ke_free_lemm (word , type_) :
    """Lemmatise `word` without letting a KeyError escape.

    Parameters
    ----------
    word : str
        Token to lemmatise.
    type_ : str
        Part-of-speech argument forwarded to ``lemmatiser.lemmatize``.

    Returns
    -------
    str
        The lemma, or `word` unchanged when the lemmatiser raises KeyError.
    """
    # NOTE(review): callers pass nltk.pos_tag tags as `type_`. Presumably the
    # KeyError fallback exists for tags the lemmatiser does not accept; confirm.
    try :
        return lemmatiser.lemmatize(word , type_)
    except KeyError :
        return word

def lemmatize (tweet):
    """Return the lemma of every token in `tweet`, using the shared lemmatiser's defaults."""
    lemmas = []
    for token in tweet:
        lemmas.append(lemmatiser.lemmatize(token))
    return lemmas

def split_ok (tweet) :
    """Split a raw tweet line on single spaces and drop empty tokens.

    One trailing newline is removed first, if there is one. A line without a
    trailing newline (for example the last line of a file) keeps its final
    character.
    """
    if tweet.endswith('\n'):
        tweet = tweet[:-1]
    return [token for token in tweet.split(' ') if token != '']


In [None]:
# Read the test file and separate the ids from the tweet text.
ids_test = []
tweets_test = []
with open('test_data.txt') as f:
    ids, tweets_test = split_id_tweet(list(f))

# Tokenise, POS-tag, lemmatise and stem every test tweet.
tweets_test_nlp = [
    [stemmer.stem(ke_free_lemm(token, tag)) for token, tag in pos_tag(split_ok(raw_tweet))]
    for raw_tweet in tweets_test
]

# One feature row per test tweet; the tweets are already token lists.
tweets_res_test = tweet_to_matrix(tweets_test_nlp, False)


In [None]:
# Display the first ten test tweets as a sanity check.
tweets_test[:10]

In [None]:
# Count the test feature rows that are entirely zero (`count`) and the total
# number of rows visited (`count2`), then show them with the ratio.
count = sum(1 for row in tweets_res_test if np.count_nonzero(row) == 0)
count2 = sum(1 for _ in tweets_res_test)
count , len(tweets_res_test) , count/len(tweets_res_test) , count2

In [None]:
# Predict a label for every test feature row and preview the first 20.
res = clf.predict(tweets_res_test)
#res = np.sign(res)
res[:20]

In [None]:
# Hand the test ids and predictions to the project helper, targeting
# 'results_nlp.csv'.
imp.create_csv_submission(ids,res,'results_nlp.csv')

In [None]:
# Count the predictions equal to 0 and show them with the total and the ratio.
count = sum(1 for label in res if label == 0)
count , len(res) , count/len(res)

In [None]:
# Print every predicted label that is neither -1 nor 1.
for label in res:
    if label != -1 and label != 1:
        print(label)