In [72]:
def process_page(page, file_name):
    """Append every tweet of one result page to the module-level buffers.

    For each response in ``page`` a row (id, text, user fields, retweet and
    favorite counts, hashtag texts) is appended to the global
    ``tweets_response_list`` and the global ``count`` is incremented.  The
    global ``df`` is rebuilt from the buffer whenever the buffer length hits
    a multiple of 100 (and then written to CSV) and once more when the page
    is finished, instead of once per tweet as before, which copied the whole
    buffer for every single row.

    Args:
        page: iterable of tweet objects exposing ``id_str``, ``full_text``,
            ``user``, ``entities`` and ``_json`` as used below.
        file_name: string inserted into the checkpoint CSV file name.

    Any exception is printed and stops processing of the remaining tweets
    on this page; rows appended before the failure are kept.
    """
    global tweets_response_list, df, count
    columns = ['id_str', 'full_text', 'user_name', 'user_screen_name', 'user_location',
               'created_at', 'retweet_count', 'favorite_count', 'hashtags']
    df_is_stale = False  # True while the buffer holds rows that `df` lacks
    try:
        for response in page:
            hashtags = [h['text'] for h in response.entities['hashtags']]
            tweets_response_list.append([
                response.id_str, response.full_text, response.user.name,
                response.user.screen_name, response.user.location,
                response.user.created_at, response._json['retweet_count'],
                response._json['favorite_count'], hashtags,
            ])
            df_is_stale = True
            count = count + 1
            if len(tweets_response_list) % 100 == 0:
                df = pd.DataFrame(tweets_response_list, columns=columns)
                df_is_stale = False
                # NOTE(review): `s` is read from module scope and is not
                # defined in this function -- presumably a run suffix;
                # confirm it is set before this is called.
                df.to_csv("output/tweets/nlp_tweet_"+file_name+"_"+s+".csv", index = False)
    except Exception as e:
        print("Exception Caught: ", e)
    finally:
        # Keep the global frame in sync with the buffer even when the page
        # ended between checkpoints or an exception cut the loop short.
        if df_is_stale:
            df = pd.DataFrame(tweets_response_list, columns=columns)

In [73]:
def searchTweet(q):
    """Return a page iterator over English, non-retweet search results for ``q``.

    The cursor asks for 100 tweets per page in extended mode and is limited
    to 100 pages.  If building the cursor raises, the error is printed, the
    function sleeps for 120 seconds and builds the cursor one more time; a
    failure of that second attempt propagates to the caller.

    NOTE(review): the cursor looks lazy, so API errors probably surface
    while the caller iterates the pages rather than here -- confirm
    whether this retry is ever reached.
    """
    def build_pages():
        cursor = tweepy.Cursor(
            API.search_tweets,
            q=q + " lang:en -filter:retweets",
            count=100,
            tweet_mode='extended',
        )
        return cursor.pages(100)

    try:
        return build_pages()
    except Exception as err:
        # Same recovery for every error; only the log prefix differs.
        if isinstance(err, tweepy.TweepyException):
            print("Tweepy Exception Caught: ", err)
        else:
            print("Exception Caught: ", err)
        print("Sleeping for 2 minutes")
        time.sleep(120)
        return build_pages()

In [74]:
def extract_tweets(query, file_name, fresh_load):
    """Download tweets for ``query`` page by page, or do nothing if cached.

    When ``fresh_load`` is falsy only the completion message is printed.
    Otherwise the module-level ``page_list``, ``count`` and
    ``tweets_response_list`` are reset, every page returned by
    ``searchTweet`` is stored in ``page_list`` and handed to
    ``process_page``, and a progress figure derived from ``count`` is
    printed whenever it is a multiple of 10.
    """
    global tweets_response_list, df, count, page_list
    done_message = "Tweets extracted in output/tweets/ folder"

    if not fresh_load:
        print(done_message)
        return

    page_list, tweets_response_list, count = [], [], 0
    for page in searchTweet(query):
        page_list.append(page)
        process_page(page, file_name)
        # One "percent" per 100 collected tweets, counted from 1.
        perc = int(count / 100) + 1
        if perc % 10 == 0:
            print(str(perc) + "%", end =" ")
    print(done_message)

In [75]:
def extract_imdb_reviews(df_movies_id, fresh_load):
    """Fetch IMDb reviews and synopsis per movie and write them to CSV.

    When ``fresh_load`` is falsy only the completion message is printed.
    Otherwise, for every value in the ``IMDB_Movie_Id`` column the review
    texts are collected and, when available, the first synopsis entry is
    appended to them.  The per-movie lists are stored in a ``reviews``
    column of ``df_movies_id`` (the caller's frame is modified in place) and
    written to ``output/imdb/imdb_reviews.csv``.  The texts are then
    flattened into one row per text, labelled with the movie's ``Universe``
    value, and written to ``output/final_output/full_imdb_review_list.csv``.

    Args:
        df_movies_id: DataFrame with ``IMDB_Movie_Id`` and ``Universe``
            columns.
        fresh_load: run the extraction when truthy.
    """
    done_message = "Extracted IMDB reviews to output/final_output/full_imdb_review_list.csv"
    if not fresh_load:
        print(done_message)
        return

    imdb = IMDb()
    all_reviews = []
    for movie_id in df_movies_id['IMDB_Movie_Id']:
        movie_reviews = imdb.get_movie(str(movie_id), ['reviews'])
        # A movie without reviews has no 'reviews' key; treat it as an empty
        # list instead of letting a KeyError abort the whole extraction.
        list_movie_reviews = [r['content'] for r in (movie_reviews.get('reviews') or [])]
        try:
            movie_synopsis = imdb.get_movie(str(movie_id), ['synopsis'])
            list_movie_reviews.append(movie_synopsis['synopsis'][0])
        except Exception as e:
            # The synopsis is best-effort: report and keep the reviews.
            print("Exception caught", e)
        all_reviews.append(list_movie_reviews)

    df_movies_id['reviews'] = all_reviews
    df_movies_id.to_csv("output/imdb/imdb_reviews.csv", index = False)

    # One output row per text, grouped (and therefore ordered) by universe.
    rows = []
    for label, review_lists in df_movies_id.groupby('Universe')['reviews']:
        for texts in review_lists:
            rows.extend((text, label) for text in texts)
    # Building the frame from rows also works when nothing was collected,
    # where concatenating an empty list of frames would raise.
    final_imdb_df = pd.DataFrame(rows, columns=['full_text', 'label'])
    final_imdb_df.to_csv("output/final_output/full_imdb_review_list.csv", index = False)
    print(done_message)

In [76]:
def clean_tokens(token_list):
    """Filter out noise tokens such as zzzzz, aa, kkk, aaaanndd or aab.

    A token is kept only when all of these hold:
      * it has at least three characters, or is exactly "dc";
      * it contains more than one distinct character;
      * no word character occurs three or more times in a row;
      * it does not start with a doubled word character.
    """
    triple_run = r'((\w)\2{2,})'
    doubled_start = r'(^(\w)\2{1,})'

    def is_clean(tkn):
        if len(tkn) < 3 and tkn != "dc":
            return False
        if len(set(tkn)) <= 1:
            return False
        if re.search(triple_run, tkn) is not None:
            return False
        return re.search(doubled_start, tkn) is None

    return [tkn for tkn in token_list if is_clean(tkn)]

In [77]:
def preprocess_doc(txt, stem, lemma, stop_wrds, selected_tags):
    """Turn a raw text into a list of cleaned tokens.

    The text is lower-cased, URLs are removed, every character that is not
    an ASCII letter or a hyphen is replaced by a space, and the result is
    tokenized and passed through ``clean_tokens``.  The optional steps then
    run in this order: stop-word removal, stemming, lemmatization, and
    filtering by part-of-speech tag.

    Args:
        txt: the document text.
        stem: apply the module-level stemmer ``pm`` when truthy.
        lemma: apply the module-level lemmatizer ``lm`` when truthy.
        stop_wrds: drop English stop words when truthy.
        selected_tags: POS tags to keep; an empty collection disables the
            filter.

    Returns:
        The list of tokens, or ``[]`` when any step raises (for example
        when ``txt`` is not a string).  The offending input and the error
        are printed in that case.
    """
    try:
        txt = txt.lower()
        txt = re.sub(r'http\S+', '', txt)  # remove URLs
        txt = re.sub('[^a-zA-Z-]', ' ', txt)  # remove punctuation and digits
        final_wrd_tkn = clean_tokens(word_tokenize(txt))
        if stop_wrds:
            # Build the lookup set once per document, not once per token.
            english_stop_words = set(stopwords.words('english'))
            final_wrd_tkn = [word for word in final_wrd_tkn if word not in english_stop_words]
        if stem:
            final_wrd_tkn = [pm.stem(word) for word in final_wrd_tkn]
        if lemma:
            final_wrd_tkn = [lm.lemmatize(word) for word in final_wrd_tkn]
        if len(selected_tags) > 0:
            final_wrd_tkn = [word for word, tag in pos_tag(final_wrd_tkn) if tag in selected_tags]
        return final_wrd_tkn
    except Exception as e:
        print(txt)
        # Most exceptions have no `.reason`; reading it unconditionally made
        # the handler itself raise AttributeError instead of returning [].
        print("Exception Caught: ", getattr(e, "reason", e))
        return []

In [78]:
def extract_stems_lemma(tags, final_input_df, fresh_load):
    """Add stemmed and lemmatized token columns, or load the cached result.

    With a truthy ``fresh_load`` every ``full_text`` value is run through
    ``preprocess_doc`` twice (stemming, then lemmatizing; stop words removed
    and tokens filtered by ``tags`` both times).  The results are stored in
    the ``stem_cleaned_tokens`` and ``lemma_cleaned_tokens`` columns of
    ``final_input_df`` (modified in place) and the frame is written to CSV.
    Otherwise the previously written CSV is read back.

    Returns:
        The resulting DataFrame.  The earlier version returned nothing, so
        the frame read in the cached branch never reached the caller.
        NOTE(review): token lists read back from CSV are strings, not
        lists -- confirm downstream code handles both.
    """
    if not fresh_load:
        final_input_df = pd.read_csv("output/final_output/final_input_cleaned_stem_lemma.csv")
    else:
        # Iterate the column values directly; `df[col][i]` is chained
        # indexing and misbehaves when the index has duplicate labels.
        final_input_df['stem_cleaned_tokens'] = [
            preprocess_doc(txt = text, stem = True, lemma = False, stop_wrds = True, selected_tags = tags)
            for text in tqdm(final_input_df['full_text'])
        ]
        final_input_df['lemma_cleaned_tokens'] = [
            preprocess_doc(txt = text, stem = False, lemma = True, stop_wrds = True, selected_tags = tags)
            for text in tqdm(final_input_df['full_text'])
        ]
        final_input_df.to_csv("output/final_output/final_input_cleaned_stem_lemma.csv", index = False)
    print("Stems and Lemma extracted to output/final_output/final_input_cleaned_stem_lemma.csv")
    return final_input_df

In [79]:
def get_class_id(label_name):
    """Return the key of the module-level ``classes`` dict whose value is ``label_name``.

    Raises ``KeyError`` when no entry has that value.  If several keys share
    the value, the one that comes last in iteration order is returned.
    """
    name_to_id = {name: class_id for class_id, name in classes.items()}
    return name_to_id[label_name]

In [80]:
def evaluate_model(X_train, y_train, X_valid, y_valid, classes, model, model_name, model_file_name, fresh_load):
    """Fit-and-save or load a model, then report its accuracy.

    With a truthy ``fresh_load`` the model is fitted on the training data
    and serialized to ``models//<model_file_name>``; otherwise the model is
    deserialized from that file and the ``model`` argument is ignored.
    Either way the result is passed to ``print_accuracy``.

    Returns:
        The ``(model, score)`` pair produced by ``print_accuracy``.
    """
    model_path = 'models//'+model_file_name
    if fresh_load:
        divider = "-"*120
        print(divider)
        model.fit(X_train, y_train)
        # Close the file deterministically; the handle used to be leaked.
        with open(model_path, 'wb') as model_file:
            dump(model, model_file)
    else:
        # NOTE(review): if `load` is pickle-based, only load model files
        # that this pipeline wrote itself.
        with open(model_path, 'rb') as model_file:
            model = load(model_file)
    model, score = print_accuracy(X_train, y_train, X_valid, y_valid, classes, model, model_name)
    return model, score

In [81]:
def print_accuracy(X_train, y_train, X_valid, y_valid, classes, model, model_name):
    """Print train/validation scores, the confusion matrix and a class report.

    ``model`` must already be fitted.  The confusion matrix of the
    validation predictions is printed and drawn as a heatmap, followed by
    ``classes`` and the classification report.  The printed messages always
    say "stemmed tf_idf vector", whatever features were actually passed in.

    Returns:
        ``(model, [train_score, validation_score])``.
    """
    divider = "-"*120
    predicted = model.predict(X_valid)
    train_acc_scr = model.score(X_train, y_train)
    print("Train Accuracy Score of "+model_name+" model created using stemmed tf_idf vector is:\n", train_acc_scr)
    print(divider)
    val_acc_scr = model.score(X_valid, y_valid)
    print("Validation Accuracy Score of "+model_name+" model created using stemmed tf_idf vector is:\n", val_acc_scr)
    print(divider)
    confusion_mat = confusion_matrix(y_true=y_valid, y_pred=predicted)
    print("Confusion Matrix:")
    print(confusion_mat)
    sn.heatmap(confusion_mat, annot=True,fmt="d",cmap=plt.cm.Accent)
    plt.title('Confusion Matrix of '+model_name+' Model', fontsize = 20)
    # Rows of the matrix are the true labels and columns the predicted ones,
    # and the heatmap draws columns along x; the two labels were swapped.
    plt.xlabel('Predicted Label')
    plt.ylabel('True Label')
    plt.show()
    print(classes)
    print(divider)
    # The report is a preformatted multi-line string; pprint showed its repr
    # with escaped newlines instead of the table.
    print(classification_report(y_valid, predicted))
    return model, [train_acc_scr, val_acc_scr]

In [82]:
def get_cross_val(model, X_train, y_train, X_valid, y_valid):
    """Fit ``model``, predict the validation set and print a 10-fold CV score.

    ``model`` is fitted in place on the training data.  The printed figure
    is the sum of the ten ``cross_val_score`` results on the training data
    divided by ten.  ``y_valid`` is accepted but not used.

    Returns:
        The predictions for ``X_valid``.
    """
    # Train once and keep the validation predictions for the caller.
    model.fit(X_train, y_train)
    validation_predictions = model.predict(X_valid)

    # Average score across the ten training folds.
    fold_scores = cross_val_score(model, X_train, y_train, cv=10)
    mean_fold_score = sum(fold_scores)/10.0
    print("Cross validation over 10 folds: ", mean_fold_score)

    return validation_predictions

In [83]:
def model_predict_test(test_tfidf_vec_list, model):
    """Predict a label name for each vector in ``test_tfidf_vec_list``.

    ``model.predict`` is called once per vector; the first element of each
    result is looked up in the module-level ``classes`` mapping.

    Returns:
        A list with one ``classes`` value per input vector.
    """
    return [classes[model.predict(vec)[0]] for vec in test_tfidf_vec_list]

In [84]:
def test_model(model, model_name, abbr, df_test, test_tfidf_vec_list):
    """Score ``model`` on the test vectors and record predictions in ``df_test``.

    Adds three columns to ``df_test`` in place: ``predicted_label_<abbr>``
    (predicted label names), ``y_true`` (class id of ``label_name``) and
    ``y_pred_<abbr>`` (class id of the predicted name).

    Returns:
        ``(accuracy, df_test)``.
    """
    predicted_names = model_predict_test(test_tfidf_vec_list, model)
    name_col = 'predicted_label_'+abbr
    df_test[name_col] = predicted_names
    df_test['y_true'] = df_test['label_name'].apply(get_class_id)
    id_col = 'y_pred_'+abbr
    df_test[id_col] = df_test[name_col].apply(get_class_id)

    test_acc_scr = accuracy_score(df_test['y_true'], df_test[id_col])
    print("Test Accuracy Score of "+model_name+" Model created using stemmed tf_idf vector is:\n", test_acc_scr)
    return test_acc_scr, df_test

In [85]:
def word_embedding_glove(file, word_index, embedding_dim):
    """Build an embedding matrix for ``word_index`` from GloVe-format lines.

    Each input line is ``word v0 v1 v2 ...``.  For every word present in
    ``word_index`` the first ``embedding_dim`` values are copied into the
    row given by its index; rows of words not found in the file stay zero.

    Args:
        file: path to the vector file (``str``, ``bytes`` or path-like), or
            an already open file / any iterable of text lines.  The earlier
            body ignored this argument and iterated an undefined ``f``.
        word_index: mapping from word to row index.
        embedding_dim: number of columns of the returned matrix.

    Returns:
        Array of shape ``(len(word_index) + 1, embedding_dim)``; the extra
        row accounts for the reserved index 0.
    """
    vocab_size = len(word_index) + 1  # +1 because index 0 is reserved
    embedding_matrix_vocab = np.zeros((vocab_size, embedding_dim))

    def fill_from(lines):
        for line in lines:
            parts = line.split()
            if not parts:
                continue  # skip blank lines instead of failing to unpack
            word, vector = parts[0], parts[1:]
            if word in word_index:
                idx = word_index[word]
                embedding_matrix_vocab[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

    if isinstance(file, (str, bytes)) or hasattr(file, "__fspath__"):
        with open(file, encoding="utf-8") as f:
            fill_from(f)
    else:
        fill_from(file)
    return embedding_matrix_vocab

In [87]:
def test_wrd_emb_model(model, model_name, abbr, df_test, x_test_emb):
    """Score ``model`` on embedded test data and record predictions in ``df_test``.

    ``model.predict`` is called once on ``x_test_emb`` and each prediction
    is mapped through the module-level ``classes``.  Adds three columns to
    ``df_test`` in place: ``predicted_label_<abbr>``, ``y_true`` and
    ``y_pred_<abbr>``.

    Returns:
        ``(accuracy, df_test)``.
    """
    predicted_ids = model.predict(x_test_emb)
    name_col = 'predicted_label_'+abbr
    df_test[name_col] = [classes[class_id] for class_id in predicted_ids]
    df_test['y_true'] = df_test['label_name'].apply(get_class_id)
    id_col = 'y_pred_'+abbr
    df_test[id_col] = df_test[name_col].apply(get_class_id)

    test_acc_scr = accuracy_score(df_test['y_true'], df_test[id_col])
    print("Test Accuracy Score of "+model_name+" Model created using lemmatized word embedding vector is:\n", test_acc_scr)
    return test_acc_scr, df_test

In [None]:
def glove_fit_transform(glove_path, data):
    """Load GloVe vectors and return one mean vector per sentence in ``data``.

    The earlier body ignored both arguments, read a hard-coded file and
    returned nothing.  NOTE(review): the averaging below copies what
    ``GloveVectorizer.transform`` does -- confirm this is the intended
    completion of this function.

    Args:
        glove_path: path of a text file with lines ``word v0 v1 ...``.
            A falsy value falls back to the path that used to be
            hard-coded.
        data: sequence of sentences (strings); each is lower-cased and split
            on whitespace.

    Returns:
        Array of shape ``(len(data), dim)`` where row ``i`` is the mean of
        the vectors of the known words of sentence ``i``; rows for sentences
        without any known word stay zero.
    """
    # Raw string: the plain literal relied on invalid escape sequences
    # (`\s`, `\g`) being kept as-is; the resulting text is unchanged.
    default_path = r'G:\spark_big_files\glove.42B\glove.42B.300d.txt'
    word2vec = {}
    with open(glove_path or default_path, encoding="utf-8") as f:
        for line in f:
            values = line.split()
            if not values:
                continue  # skip blank lines
            word2vec[values[0]] = np.asarray(values[1:], dtype='float32')
    print('Found %s word vectors.' % len(word2vec))

    dim = len(next(iter(word2vec.values()))) if word2vec else 0
    X = np.zeros((len(data), dim))
    for row, sentence in enumerate(data):
        vecs = [word2vec[word] for word in sentence.lower().split() if word in word2vec]
        if vecs:
            X[row] = np.mean(vecs, axis=0)
    return X

In [88]:
#reference https://github.com/lazyprogrammer/machine_learning_examples/blob/master/nlp_class2/bow_classifier.py
class GloveVectorizer:
    """Sentence vectorizer that averages pre-trained GloVe word vectors.

    After construction the instance exposes ``word2vec`` (word -> vector),
    ``embedding`` (array with one row per file line), ``word2idx``
    (word -> row index) and ``V`` / ``D`` (the two dimensions of
    ``embedding``).
    """

    # Raw string: the plain literal relied on invalid escape sequences
    # (`\s`, `\g`) being kept as-is; the resulting text is unchanged.
    DEFAULT_PATH = r'G:\spark_big_files\glove.42B\glove.42B.300d.txt'

    def __init__(self, glove_path=None):
        """Load the vectors from ``glove_path`` (default: ``DEFAULT_PATH``).

        Raises:
            ValueError: if the file contains no vectors.
        """
        # load in pre-trained word vectors
        print('Loading word vectors...')
        word2vec = {}
        embedding = []
        idx2word = []
        with open(glove_path or self.DEFAULT_PATH, encoding="utf-8") as f:
            # is just a space-separated text file in the format:
            # word vec[0] vec[1] vec[2] ...
            for line in f:
                values = line.split()
                if not values:
                    continue  # skip blank lines
                word = values[0]
                vec = np.asarray(values[1:], dtype='float32')
                word2vec[word] = vec
                embedding.append(vec)
                idx2word.append(word)
        print('Found %s word vectors.' % len(word2vec))
        if not embedding:
            # Fail with a clear message rather than an unpacking error below.
            raise ValueError("no word vectors found in the GloVe file")

        # save for later
        self.word2vec = word2vec
        self.embedding = np.array(embedding)
        self.word2idx = {word: idx for idx, word in enumerate(idx2word)}
        self.V, self.D = self.embedding.shape

    def fit(self, data):
        """Do nothing; the vectors are pre-trained."""
        pass

    def transform(self, data):
        """Return an array of shape ``(len(data), D)`` of mean word vectors.

        Each sentence is lower-cased and split on whitespace; words missing
        from the vocabulary are skipped.  A sentence without any known word
        keeps an all-zero row and is counted in the printed summary.
        """
        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = [self.word2vec[word] for word in sentence.lower().split()
                    if word in self.word2vec]
            if vecs:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)

In [90]:
class Word2VecVectorizer:
    """Sentence vectorizer that averages pre-trained word2vec word vectors."""

    # Raw string: the plain literal relied on invalid escape sequences
    # (`\s`, `\G`) being kept as-is; the resulting text is unchanged.
    DEFAULT_PATH = r'G:\spark_big_files\GoogleNews-vectors-negative300.bin\GoogleNews-vectors-negative300.bin'

    def __init__(self, word2vec_path=None):
        """Load binary word2vec vectors from ``word2vec_path``.

        A falsy path falls back to ``DEFAULT_PATH``.
        """
        print("Loading in word vectors...")
        self.word_vectors = KeyedVectors.load_word2vec_format(
            word2vec_path or self.DEFAULT_PATH, encoding="utf-8",
            binary=True
        )
        print("Finished loading in word vectors")

    def fit(self, data):
        """Do nothing; the vectors are pre-trained."""
        pass

    def transform(self, data):
        """Return an array of shape ``(len(data), D)`` of mean word vectors.

        Sentences are split on whitespace without changing case; words the
        model does not know are skipped.  A sentence without any known word
        keeps an all-zero row and is counted in the printed summary.
        """
        # Read the dimensionality from the model itself; probing a fixed
        # word ('king') failed on vocabularies that do not contain it.
        self.D = self.word_vectors.vector_size

        X = np.zeros((len(data), self.D))
        emptycount = 0
        for n, sentence in enumerate(data):
            vecs = []
            for word in sentence.split():
                try:
                    # throws KeyError if word not found
                    vecs.append(self.word_vectors.get_vector(word))
                except KeyError:
                    pass
            if vecs:
                X[n] = np.array(vecs).mean(axis=0)
            else:
                emptycount += 1
        print("Numer of samples with no words found: %s / %s" % (emptycount, len(data)))
        return X

    def fit_transform(self, data):
        """Call ``fit`` and then return ``transform(data)``."""
        self.fit(data)
        return self.transform(data)