In [35]:
## import data processing/cleaning , data modeling libraries

import pandas as pd
import os
import sys
import re as re
import datetime as datetime
import numpy as np
import collections
import string

from sklearn.decomposition import PCA
from sklearn.tree import DecisionTreeRegressor
from nltk.tag.perceptron import PerceptronTagger

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cross_validation import train_test_split
    
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

#from gensim.models import doc2vec
from gensim.models import Doc2Vec
from gensim.models import Word2Vec
from gensim.models import fasttext

import warnings
warnings.filterwarnings('ignore')
t0 = datetime.datetime.now()

/home/ubuntu/Notebook/adviewRecomm/version3/temp/


In [36]:
outData43 = pd.read_csv("reviews.txt",encoding="utf-8",header=None,names=["text"])
print(outData43.shape)
print(outData43.head(n=2)) # 39:

(25000, 1)
                                                text
0  bromwell high is a cartoon comedy . it ran at ...
1  story of a man who has unnatural feelings for ...


In [37]:
## NLP PRE-PROCESSING

#text=" the JanaSena Party Formation Day celebrations || LIVE funny celebration|| Pawan Kalyan || Guntur"
text="Chak De India | Full Title Song | Shah Rukh Khan | Sukhvinder Singh | Salim | Marianne D'Cruz"

def nltk_clean_sent(line):
    if len(line)>0:
        ## remove the punctuation/emoticons/digits/multispaces with single from the line
        ## dont make lowercase before the pos tagging
        line_lower = line.strip()
        line_punct = re.sub('['+string.punctuation+']',' ',line_lower)
        line_emots = re.sub(r'[\u200b-\u2fff]+',' ',line_punct)
        line_digis = re.sub(r'[0-9]+',' ',line_emots)
        line_spaces = re.sub(r'[\s]+',' ',line_digis)
        line = line_spaces
    return line

def nltk_extract_postags(line):
    cleaned_str = ''
    tokens = nltk.word_tokenize(line)
    tokens_pos = PerceptronTagger().tag(tokens)
    #print(tokens_pos)
    # noun tags
    nn_tags = ['NN', 'NNP', 'NNP', 'NNPS', 'NNS']
    # adjectives
    jj_tags = ['JJ', 'JJR', 'JJS']
    # verbs
    vb_tags = ['VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ']
    nltk_tags = nn_tags + jj_tags + vb_tags
    
    for tag_word in tokens_pos:
        if tag_word[1] in nltk_tags:
            if len(tag_word[0])>2:
                cleaned_str += tag_word[0]+' '
    return cleaned_str.strip().lower()

def nltk_apply_lemma(line):
    tokens_lemmas = [WordNetLemmatizer().lemmatize(word) for word in line.split()]
    ## stemming
    tokens_stops = [word for word in tokens_lemmas if word not in stopwords.words('english')]
    tokens_stops = [word for word in tokens_stops if len(word.strip())>2]
    tokens_stops = list(set(tokens_stops))
    return tokens_stops

def nltk_extract_tags(line):
        ## tokenize the sentence/get tokens that contains only letters
        line_clean = nltk_clean_sent(line)
        ## apply postags to the words and get only couple of tags and word length >2
        tokens_pos = nltk_extract_postags(line_clean)
        ## apply lemmatize/stemming and remove stopwords
        token_lemma = nltk_apply_lemma(tokens_pos)
        return token_lemma
    
nltk_extract_tags(text)

['khan',
 'shah',
 'marianne',
 'song',
 'chak',
 'rukh',
 'salim',
 'cruz',
 'sukhvinder',
 'title',
 'india',
 'full',
 'singh']

In [38]:
# TF-IDF transformation
def getTfIdfMetrics(outData44):
    t0 = datetime.datetime.now()
    tfidf_transform = TfidfVectorizer(tokenizer=nltk_extract_tags,min_df=3,max_df=0.95,stop_words='english',use_idf=True,ngram_range=(1,1))
    #terms = tfidf_transform.get_feature_names()
    #print(len(terms))
    #print(terms[:100])
    tfidf_vecto = tfidf_transform.fit_transform(outData44["text"])
    print(tfidf_vecto.shape)
    # got shape of (83806, 149542) - without translation
    t1 = datetime.datetime.now()
    print("END TIME after TFIDF =",t1)
    print("time taken until TFIDF=",(t1-t0))
    return tfidf_vecto

## PRINTING FIRST 100 FEATURES
#getTfIdfMetrics(outData44)
#terms = tfidf_transform.get_feature_names()
#print(terms[:100])

In [47]:
# ## DOC2VEC model
# #outData44_en_sample = outData44_en[50:55]

# cleaneddoc = [nltk_extract_tags(text) for text in list(outData44_en["v_title_trans"])]
# taggeddoc = [doc2vec.TaggedDocument(val,[idx]) for idx,val in enumerate(cleaneddoc)]

# d2vmodel = Doc2Vec(size=100,min_count=2,alpha=0.025,min_alpha=0.025)
# d2vmodel.build_vocab(taggeddoc)
# #print(d2vmodel[0])
# d2vmodel.train(taggeddoc,total_examples=d2vmodel.corpus_count,epochs=10,start_alpha=0.002,end_alpha=-0.016)
# d2vmodel_vecs = [d2vmodel.infer_vector(val) for idx,val in enumerate(cleaneddoc)]
# #print(d2vmodel_vecs[0])

##feature_vecto = d2vmodel.docvecs.doctag_syn0
##print(len(d2vmodel_vecs))

## WORD2VEC model # Word2Vec
def getWord2VecMetrics(outData44):
    t0 = datetime.datetime.now()
    cleaneddoc = [nltk_extract_tags(text) for text in list(outData44["text"])]
    cleaneddoc = [text for text in cleaneddoc if len(text)>0]
    w2vmodel = Word2Vec(cleaneddoc,min_count=2,size=300)
    w2v_vecs = w2vmodel.wv.syn0
    t1 = datetime.datetime.now()
    print("END TIME after word2vec =",t1)
    print("time taken until word2vec=",(t1-t0))
    return w2v_vecs
#feature_vecto = getWord2VecMetrics(outData44,lang)
#print(feature_vecto.shape)

In [48]:
## MODEL FUNCTION DEFINITION
#feature_vecto = np.stack(d2vmodel.docvecs)
#def getModelDataTfIdf(outData5,lang,feature_vecto):

In [49]:
startT = datetime.datetime.now()

## feature engineering
#feature_vecto = getTfIdfMetrics(outData43)
feature_vecto = getWord2VecMetrics(outData43)
print(feature_vecto.shape)
## Model Data Function Call
#clustData2 = getModelDataTfIdf(outData5,lang,feature_vecto)
#print(clustData2.shape)
endT = datetime.datetime.now()
print("TIME taken for Modeling=",(endT-startT))

END TIME after word2vec = 2018-07-07 23:23:40.780616
time taken until word2vec= 0:23:51.622409
(37007, 200)
TIME taken for Modeling= 0:23:51.724596


In [42]:
# ERROR : ValueError: max_df corresponds to < documents than min_df - min_df=6,max_df=0.9 - pass dataframe column when transforming tfidf

# #min_count=2,size=200
# END TIME after word2vec = 2018-07-07 23:23:40.780616
# time taken until word2vec= 0:23:51.622409
# (37007, 200)
# TIME taken for Modeling= 0:23:51.724596

#min_count=1,size=200
# END TIME after word2vec = 2018-07-07 22:48:00.570493
# time taken until word2vec= 0:24:20.255714
# (63044, 200)
# TIME taken for Modeling= 0:24:20.365022

# min_df=3,max_df=0.95
# (25000, 29447)
# END TIME after TFIDF = 2018-07-07 22:08:19.848881
# time taken until TFIDF= 0:22:22.619855
# (25000, 29447)
# TIME taken for Modeling= 0:22:22.635482
        
#     from sklearn.cluster import k_means_
#     from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances