<a href="https://colab.research.google.com/github/mmamel/Twitter_NLP/blob/master/Exploration.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import os
import string
import time
import spacy
import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk import word_tokenize
from nltk.stem import WordNetLemmatizer

#Implementation is based on Nick Koprowicz's word count solution. Citation: Koprowicz, N. (2020). A simple solution using only word counts (Version 17) [Source code, Python notebook]. https://www.kaggle.com/nkoprowicz/a-simple-solution-using-only-word-counts
#Added code modifies the training data to include the non-selected text and uses spaCy to analyze word dependencies

##Note: this code was not the best scoring; it is included to reflect the exploration part of the assignment.##
##The different approaches diverge with the function calculate_selected_text (non-selected text implementation) vs calculate_selected_text_2 (spaCy implementation)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [None]:

def replace_string(str1, str2):
    """Return `str1` with every occurrence of the substring `str2` removed.

    The match is exact (case-sensitive); if `str2` does not occur in `str1`,
    `str1` is returned unchanged.
    """
    return str1.replace(str2, "")

In [None]:
#Load data
# The 'en' shortcut link was removed in spaCy 3 (spacy.load raises OSError for it),
# so fall back to the full package name only when the shortcut cannot be loaded.
try:
    nlp = spacy.load('en')
except OSError:
    nlp = spacy.load('en_core_web_sm')
train = pd.read_csv("train.csv")
test = pd.read_csv("test.csv")
train.dropna(inplace=True)
# .str.lower() leaves a missing value as NaN instead of raising AttributeError on it
# (test is not passed through dropna, so it may still contain missing text).
train['text'] = train['text'].str.lower()
test['text'] = test['text'].str.lower()

#Split data for training and validation
X_train, X_val = train_test_split(
    train, train_size = 0.80, random_state = 0)

#prevent setting with copy warning by explicitly stating X_train is independent
X_train = X_train.copy()
X_val = X_val.copy()

#Split training based on sentiment
pos_train = X_train[X_train['sentiment'] == 'positive']
neutral_train = X_train[X_train['sentiment'] == 'neutral']
neg_train = X_train[X_train['sentiment'] == 'negative']

# 'text' was lower-cased above but 'selected_text' keeps its original casing, so the
# selection must be lower-cased here too; otherwise str.replace finds no match for any
# selection containing a capital letter and 'non_selected' ends up as the whole tweet.
X_train['non_selected'] = X_train.apply(
    lambda x: replace_string(x['text'], x['selected_text'].lower()), axis = 1)

In [None]:
class LemmaTokenizer:
  """Callable that tokenizes a document with NLTK and lemmatizes every token.

  Calling an instance returns the list of WordNet-lemmatized tokens produced by
  `word_tokenize(doc)`, in their original order.
  """
  def __init__(self):
    # One lemmatizer per tokenizer instance, reused across calls.
    lemmatizer = WordNetLemmatizer()
    self.wnl = lemmatizer
  def __call__(self, doc):
    tokens = word_tokenize(doc)
    return list(map(self.wnl.lemmatize, tokens))


# NOTE: despite its name, `tfidf` is a plain bag-of-words CountVectorizer (raw counts,
# no IDF weighting). The name is kept so other cells that refer to it keep working.
tfidf = CountVectorizer(max_df=0.95, min_df=2,max_features =10000, stop_words='english')


def _vocabulary(vectorizer):
    """Return the fitted feature names of `vectorizer` as a list of str.

    `get_feature_names()` was removed in scikit-learn 1.2 in favour of
    `get_feature_names_out()`; use whichever the installed version provides.
    """
    if hasattr(vectorizer, 'get_feature_names_out'):
        return list(vectorizer.get_feature_names_out())
    return list(vectorizer.get_feature_names())


def _occurrences_per_tweet(count_matrix, feature_names, n_tweets):
    """Map each feature to its total count in `count_matrix` divided by `n_tweets`.

    The column sums are taken directly on the sparse matrix: densifying it first
    would allocate rows x features integer cells just to add them up.
    """
    totals = np.asarray(count_matrix.sum(axis=0)).ravel()
    return {word: total / n_tweets for word, total in zip(feature_names, totals)}


# Fit the vocabulary on every training tweet, then count words per sentiment group.
X_train_cv = tfidf.fit_transform(X_train['text'])
text_vocabulary = _vocabulary(tfidf)

X_pos = tfidf.transform(pos_train['text'])
X_neutral = tfidf.transform(neutral_train['text'])
X_neg = tfidf.transform(neg_train['text'])

# Dictionaries of the words within each sentiment group. Each value is the total number
# of occurrences of the word in that group divided by the number of tweets in the group
# (a word repeated inside one tweet is counted every time it appears).
pos_words = _occurrences_per_tweet(X_pos, text_vocabulary, pos_train.shape[0])
neutral_words = _occurrences_per_tweet(X_neutral, text_vocabulary, neutral_train.shape[0])
neg_words = _occurrences_per_tweet(X_neg, text_vocabulary, neg_train.shape[0])

# Refit the SAME vectorizer on the text left over once the labelled selection is removed.
# From here on `tfidf` holds the non-selected vocabulary, which can differ from the one
# used for pos_words / neutral_words / neg_words above.
X_train_cv_2 = tfidf.fit_transform(X_train['non_selected'])
# fit_transform already returns the counts for this very input, so a second transform
# pass over the same column is unnecessary.
X_non = X_train_cv_2

non_words = _occurrences_per_tweet(
    X_non, _vocabulary(tfidf), X_train['non_selected'].shape[0])

# Non-selected weights are used as-is (plain copy of non_words).
non_word_adj = dict(non_words)

# Adjusted sentiment weights: how much more frequent a word is in one sentiment group
# than in the other two groups combined.
neg_words_adj = {
    word: neg_words[word] - (neutral_words[word] + pos_words[word]) for word in neg_words}
pos_words_adj = {
    word: pos_words[word] - (neutral_words[word] + neg_words[word]) for word in pos_words}
neutral_words_adj = {
    word: neutral_words[word] - (neg_words[word] + pos_words[word]) for word in neutral_words}

In [None]:
#APPROACH TWO - Simple word count to create a vocabulary, which is fitted onto test cases to generate a probability
def calculate_selected_text(df_row, tol = 0):
    """Pick the contiguous run of words in a tweet with the highest sentiment score.

    Each whitespace-separated word (with ASCII punctuation stripped for lookup only)
    contributes its weight from the sentiment dictionary (`pos_words_adj` or
    `neg_words_adj`) minus its weight from `non_word_adj` when it is a key of
    `non_words`. Runs are visited shortest first, left to right within a length;
    a run replaces the current best only if its score exceeds the best score by
    more than `tol`.

    Parameters
    ----------
    df_row : mapping with 'text' and 'sentiment' entries (e.g. a DataFrame row).
    tol : margin a candidate must beat the current best score by to be selected.

    Returns
    -------
    str
        The tweet unchanged for 'neutral' (or any sentiment other than 'positive' /
        'negative'); otherwise the chosen words joined by single spaces, or all words
        joined by single spaces when no run scores above `tol`.
    """
    tweet = df_row['text']
    sentiment = df_row['sentiment']

    if sentiment == 'positive':
        dict_to_use = pos_words_adj # Calculate word weights using the pos_words dictionary
    elif sentiment == 'negative':
        dict_to_use = neg_words_adj # Calculate word weights using the neg_words dictionary
    else:
        # Neutral tweets are returned whole. Any other label gets the same whole-tweet
        # fallback rather than failing later on an unbound dictionary.
        return tweet

    words = tweet.split()
    words_len = len(words)

    # Look every word up exactly once. Gain and penalty are stored separately (None
    # when the word is absent) so the per-run sums below are accumulated in the same
    # order as a direct "+= gain, -= penalty" pass over the words.
    strip_punct = str.maketrans('', '', string.punctuation)
    gains = []
    penalties = []
    for word in words:
        cleaned = word.translate(strip_punct)
        gains.append(dict_to_use[cleaned] if cleaned in dict_to_use else None)
        penalties.append(non_word_adj[cleaned] if cleaned in non_words else None)

    score = 0
    best_start, best_end = 0, 0 # best_end == 0 means "nothing selected yet"

    # Enumerate runs by (length, start) instead of materialising and sorting every
    # sub-list: this is the order a stable sort by length would produce.
    for length in range(1, words_len + 1):
        for start in range(words_len - length + 1):
            new_sum = 0 # Sum for the current run
            for p in range(start, start + length):
                if gains[p] is not None:
                    new_sum += gains[p]
                if penalties[p] is not None:
                    new_sum -= penalties[p]
            if new_sum > score + tol:
                score = new_sum
                best_start, best_end = start, start + length

    # If we didn't find good substrings, return the whole text
    if best_end == 0:
        return ' '.join(words)

    return ' '.join(words[best_start:best_end])

In [None]:
#APPROACH 3 - spaCy implementation to analyze sentence structure

def calculate_selected_text_2(df_row, tol = 0):
    """Select the highest-weighted word of a tweet plus its spaCy dependency children.

    Runs of periods are replaced by spaces, the tweet is split on whitespace, and the
    word with the largest positive weight in the sentiment dictionary (`pos_words_adj`
    or `neg_words_adj`, looked up with ASCII punctuation stripped) is chosen. The tweet
    is then parsed with the module-level `nlp` pipeline, and the children of every
    token matching the chosen word are appended after it.

    Parameters
    ----------
    df_row : mapping with 'text' and 'sentiment' entries (e.g. a DataFrame row).
    tol : accepted for signature parity with calculate_selected_text; not used.

    Returns
    -------
    str
        The tweet unchanged for 'neutral' (or any sentiment other than 'positive' /
        'negative') and when no word has a positive weight; otherwise the chosen word
        followed by its dependency children, joined by single spaces.
    """
    tweet = df_row['text']
    sentiment = df_row['sentiment']

    if sentiment == 'positive':
        dict_to_use = pos_words_adj # Calculate word weights using the pos_words dictionary
    elif sentiment == 'negative':
        dict_to_use = neg_words_adj # Calculate word weights using the neg_words dictionary
    else:
        # Neutral tweets are returned whole. Any other label gets the same whole-tweet
        # fallback rather than failing later on an unbound dictionary.
        return tweet

    text = re.sub(r"[.]+", " ", tweet)
    strip_punct = str.maketrans('', '', string.punctuation)

    max_word = '' # the word as written in the tweet (punctuation kept)
    max_key = ''  # the same word with punctuation stripped, as used for the lookup
    max_val = 0
    for word in text.split():
        key = word.translate(strip_punct)
        weight = dict_to_use.get(key)
        if weight is not None and weight > max_val:
            max_val = weight
            max_word = word
            max_key = key

    # Nothing scored above zero: an empty prediction can never overlap the answer,
    # so fall back to the whole tweet.
    if not max_word:
        return tweet

    selection_str = [max_word]
    doc = nlp(str(tweet))
    for token in doc:
        # The tokenizer may split punctuation off a word ("happy!" -> "happy", "!"),
        # so accept either the raw word or its punctuation-stripped form.
        if token.text == max_word or token.text == max_key:
            for child in token.children:
                child_text = str(child)
                if child_text.strip(): # skip whitespace-only tokens
                    selection_str.append(child_text)

    return ' '.join(selection_str)

In [None]:
# Silences pandas' SettingWithCopyWarning for the rest of the session (global option).
pd.options.mode.chained_assignment = None
tol = 0.001

# Predict once per row and assign the whole column in a single step. Looking each row
# up again through a boolean mask on 'textID' scans the entire frame per row, which
# makes the loop quadratic in the number of validation tweets.
X_val['predicted_selection'] = [
    calculate_selected_text(row, tol) for _, row in X_val.iterrows()
]

In [None]:
#Evaluation function 
def jaccard(str1, str2):
    """Word-level Jaccard similarity between two strings.

    Both strings are lower-cased and split on whitespace; the score is
    |A & B| / |A | B| over the resulting sets of distinct words.

    Returns a float in [0.0, 1.0]. When neither string contains any word
    (both empty or whitespace-only) the sets are identical, so 1.0 is returned
    by convention instead of dividing by zero.
    """
    a = set(str1.lower().split())
    b = set(str2.lower().split())
    if not a and not b:
        return 1.0
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [None]:
#Score each validation row: jaccard between the labelled selection and our prediction
X_val['jaccard'] = [
    jaccard(truth, guess)
    for truth, guess in zip(X_val['selected_text'], X_val['predicted_selection'])
]

#Report the mean jaccard over the validation set
print('The jaccard score for the validation set is:', np.mean(X_val['jaccard']))

The jaccard score for the validation set is: 0.6604283711848614
