# Load Data

In [None]:
import pandas as pd
import emoji
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from wordcloud import STOPWORDS
import nltk
from nltk.corpus import stopwords
import csv
import re
import seaborn as sns
from nltk.tokenize import sent_tokenize, word_tokenize
nltk.download('punkt')
from nltk.stem.snowball import SnowballStemmer
from nltk.stem import PorterStemmer
from nltk.stem import LancasterStemmer
from nltk.stem import WordNetLemmatizer 
nltk.download('wordnet')

# Load the training set as a tab-separated file.
# quotechar='\0' makes NUL the quote character — presumably so that quote
# marks inside tweets are read as ordinary text; verify against the data.
df=pd.read_csv(r'C:\Users\Mhaiskao\Desktop\Assignments\DMTA\olid-training-v1.0.tsv',sep='\t', encoding="utf-8",quotechar='\0')


In [None]:
# Preview the first rows of the loaded training data.
df.head()

# Data Preparation

In [None]:
def preparation(df):
    """Clean the ``tweet`` column in place and add a ``tidy_tweet`` column.

    Every tweet is lower-cased, the literal substrings ``user`` and ``url``
    are deleted, emojis are replaced by their names (``emoji.demojize`` with
    an empty leading delimiter and a space as trailing delimiter), the listed
    punctuation characters are stripped, and every word of three characters
    or fewer is dropped.

    The cleaned text is stored in ``tidy_tweet``; ``tweet`` receives the same
    text wrapped in single quotes.

    Args:
        df: DataFrame with a string ``tweet`` column.

    Returns:
        The same DataFrame object, modified in place.
    """
    # Work on the whole column at once. The previous row loop wrote through
    # df['tweet'][i] (chained assignment, which pandas does not guarantee to
    # reach the frame) and re-ran the column-wide punctuation replace once
    # per row.
    tweets = df['tweet'].str.lower()

    # Removal of User Id and URL placeholders (plain substring deletion)
    tweets = tweets.str.replace("user", "", regex=False)
    tweets = tweets.str.replace("url", "", regex=False)

    # Converting emojis to corresponding words
    tweets = tweets.apply(lambda text: emoji.demojize(text, delimiters=("", " ")))

    # Removal of punctuation. regex=True must be explicit: since pandas 2.0
    # Series.str.replace treats the pattern as a literal string by default,
    # which would leave all punctuation in place.
    tweets = tweets.str.replace(r"[.,!?:;\-=\"’'@+~]", "", regex=True)

    # Keep only words longer than 3 characters
    tweets = tweets.apply(lambda text: ' '.join(w for w in text.split() if len(w) > 3))

    df['tidy_tweet'] = tweets
    # The raw-text column is stored single-quoted
    df['tweet'] = "'" + tweets.astype(str) + "'"

    return df


# Missing subtask labels become the literal string "NULL".
# Assign the result back instead of calling fillna(inplace=True) on the
# selected column: in-place mutation through a column selection is
# deprecated and does not reach the frame under copy-on-write.
df['subtask_c'] = df['subtask_c'].fillna("NULL")
df['subtask_b'] = df['subtask_b'].fillna("NULL")
df=preparation(df)
# df.to_csv(r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\training_updated_tweet.arff", header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')

# Plot Graph for Targets

In [None]:
def graph_subtask(subtask):
    """Print the class percentages of a label column and draw its count plot.

    Reads the notebook-level DataFrame ``df``.

    Args:
        subtask: Name of the label column in ``df`` (e.g. ``'subtask_a'``).
    """
    # This distribution used to be computed and silently thrown away;
    # print it so it is visible next to the plot.
    percentages = df[subtask].value_counts(normalize=True) * 100
    print(percentages)
    sns.countplot(x=subtask, data=df)

graph_subtask('subtask_a')


In [None]:
# Count plot of the subtask_b labels.
graph_subtask('subtask_b')

In [None]:
# Count plot of the subtask_c labels.
graph_subtask('subtask_c')

In [None]:
# Display the prepared training DataFrame.
df

# Remove Stop Words

In [None]:
def remove_stopwords(df):
    """Return a new DataFrame whose ``tidy_tweet`` has stop words removed.

    The stop-word collection combines three custom words, wordcloud's
    ``STOPWORDS`` and NLTK's English list. Words are compared lower-cased.

    Args:
        df: DataFrame with ``tidy_tweet``, ``subtask_a``, ``subtask_b`` and
            ``subtask_c`` columns.

    Returns:
        A new DataFrame with columns ``tidy_tweet``, ``subtask_a``,
        ``subtask_b``, ``subtask_c``. The label columns are copied from
        ``df`` and aligned on the index.
    """
    nltk.download('stopwords')

    # A set gives constant-time membership tests; the combined list was
    # previously scanned end to end for every word of every tweet.
    stop_words = {"will","take","should've"} | set(STOPWORDS) | set(stopwords.words('english'))

    cleaned_tweets = [
        ' '.join(word for word in tweet.split() if word.lower() not in stop_words)
        for tweet in df['tidy_tweet']
    ]

    df_without_stopwords = pd.DataFrame({'tidy_tweet': cleaned_tweets})
    for label in ('subtask_a', 'subtask_b', 'subtask_c'):
        df_without_stopwords[label] = df[label]
    return df_without_stopwords


In [None]:
# Strip stop words, then copy the cleaned text back so that
# df['tidy_tweet'] holds the same text as df_without_stopwords['tidy_tweet'].
df_without_stopwords=remove_stopwords(df)
df['tidy_tweet']=df_without_stopwords['tidy_tweet']

# Stemming and Lemmatization

In [None]:
def Tokenize(df_stemming,df_without_stopwords):
    """Store each tidy tweet as a list of whitespace-separated tokens.

    Writes the column ``tokenized`` into ``df_stemming``; returns nothing.
    """
    def split_on_whitespace(text):
        return text.split()

    tidy_tweets = df_without_stopwords['tidy_tweet']
    df_stemming['tokenized'] = tidy_tweets.apply(split_on_whitespace)
    
def stemSentence(sentence):
    """Porter-stem every token that ``word_tokenize`` finds in *sentence*.

    Each stem is followed by a single space, so a non-empty result ends
    with a trailing space.
    """
    stemmer = PorterStemmer()
    return "".join(stemmer.stem(token) + " " for token in word_tokenize(sentence))

def WordTokenize(df_stemming,df_without_stopwords):
    """Write the ``stemSentence`` result of every tidy tweet to ``word_tokenize``."""
    tidy_tweets = df_without_stopwords['tidy_tweet']
    df_stemming['word_tokenize'] = tidy_tweets.apply(stemSentence)
     
def SnowballStemmer1(df_stemming,df_without_stopwords):
    """Snowball-stem every word of each tidy tweet into ``SnowballStemmer123``.

    Args:
        df_stemming: DataFrame that receives the new column.
        df_without_stopwords: DataFrame providing the ``tidy_tweet`` text.
    """
    englishStemmer=SnowballStemmer("english")

    def stem_tweet(tweet):
        # stem() works on one token. Passing the whole tweet treated the
        # sentence as a single word, so only its very end could change.
        return ' '.join(englishStemmer.stem(word) for word in tweet.split())

    df_stemming['SnowballStemmer123']=df_without_stopwords['tidy_tweet'].apply(stem_tweet)
    
def Porter(df_stemming,df_without_stopwords):
    """Porter-stem every word of each tidy tweet into the ``Porter`` column.

    Args:
        df_stemming: DataFrame that receives the new column.
        df_without_stopwords: DataFrame providing the ``tidy_tweet`` text.
    """
    porter = PorterStemmer()

    def stem_tweet(tweet):
        # stem() works on one token. Passing the whole tweet treated the
        # sentence as a single word, so only its very end could change.
        return ' '.join(porter.stem(word) for word in tweet.split())

    df_stemming['Porter']=df_without_stopwords['tidy_tweet'].apply(stem_tweet)

def Lancaster(df_stemming,df_without_stopwords):
    """Lancaster-stem every word of each tidy tweet into the ``Lancaster`` column.

    Args:
        df_stemming: DataFrame that receives the new column.
        df_without_stopwords: DataFrame providing the ``tidy_tweet`` text.
    """
    lancaster=LancasterStemmer()

    def stem_tweet(tweet):
        # stem() works on one token. Passing the whole tweet treated the
        # sentence as a single word, so only its very end could change.
        return ' '.join(lancaster.stem(word) for word in tweet.split())

    df_stemming['Lancaster']=df_without_stopwords['tidy_tweet'].apply(stem_tweet)
    
def Lemmatizer(df_stemming,df_without_stopwords):
    """Lemmatize every word of each tidy tweet into the ``Lemmatized`` column.

    Args:
        df_stemming: DataFrame that receives the new column.
        df_without_stopwords: DataFrame providing the ``tidy_tweet`` text.
    """
    lemmatizer = WordNetLemmatizer()

    def lemmatize_tweet(tweet):
        # lemmatize() looks up one word. Passing the whole tweet looked the
        # entire sentence up as a single entry, leaving it unchanged.
        return ' '.join(lemmatizer.lemmatize(word) for word in tweet.split())

    df_stemming['Lemmatized']=df_without_stopwords['tidy_tweet'].apply(lemmatize_tweet)

def StemmingAndLemma(df_stemming,df_without_stopwords):
    """Run every tokenizing, stemming and lemmatizing step on the tidy tweets.

    Each step adds one column to ``df_stemming``. The steps run in the order
    listed, which fixes the column order of the result.

    Returns:
        ``df_stemming`` with the added columns.
    """
    pipeline = (
        WordTokenize,       # Word Tokenize using word_tokenize
        SnowballStemmer1,   # SnowballStemmer
        Tokenize,           # Tokenizer
        Porter,             # Porter Stemmer
        Lancaster,          # Lancaster Stemmer
        Lemmatizer,         # Lemmatizer
    )
    for step in pipeline:
        step(df_stemming, df_without_stopwords)

    return df_stemming

# Start the stemming table from the stop-word-free text, then let
# StemmingAndLemma add one column per tokenizer/stemmer/lemmatizer.
df_stemming=pd.DataFrame()
df_stemming['tidy_tweet']=df_without_stopwords['tidy_tweet']

df_stemming=StemmingAndLemma(df_stemming,df_without_stopwords)

# Remove words having frequency less than 3 from each Stemmed and Lemmatized Tweet

In [None]:
from collections import defaultdict  # available in Python 2.5 and newer

def get_frequency_of_words(df_stemming,column_name,min_frequency=3):
    """Return the words of a text column that occur often enough to keep.

    Words are the whitespace-separated tokens of every value in
    ``df_stemming[column_name]``; counts are taken over the whole column.

    Args:
        df_stemming: DataFrame holding the text column.
        column_name: Name of the column to count words in.
        min_frequency: Minimum number of occurrences a word needs in order
            to be kept. The previous test was ``count > 3``, which also
            dropped words occurring exactly three times, contradicting the
            documented "minimum 3 frequency".

    Returns:
        List of kept words, in order of first appearance.
    """
    from collections import Counter

    print('Get Frequency for: ',column_name)
    word_counts = Counter()
    for tweet in df_stemming[column_name]:
        word_counts.update(tweet.split())

    return [word for word, count in word_counts.items() if count >= min_frequency]

def remove_words_from_tweet(df_stemming,columnname,word_list):
    """Keep only the words of ``word_list`` in every value of a text column.

    A tweet in which none of the words is in ``word_list`` is left unchanged
    rather than being emptied.

    Args:
        df_stemming: DataFrame holding the text column; modified in place.
        columnname: Name of the column to filter.
        word_list: Iterable of words to keep.

    Returns:
        ``df_stemming`` (the same object).
    """
    # Set lookup instead of scanning the word list for every word.
    allowed = set(word_list)

    def keep_allowed(tweet):
        kept = [word for word in tweet.split() if word in allowed]
        return ' '.join(kept) if kept else tweet

    # One column assignment replaces the per-row df[col][i] = ... writes,
    # which were chained assignments and not guaranteed to reach the frame.
    df_stemming[columnname] = df_stemming[columnname].apply(keep_allowed)
    return df_stemming

In [None]:

def frequency(df_stemming):
    """Filter every text column down to the words reported as frequent.

    For each column, ``get_frequency_of_words`` supplies the words to keep
    and ``remove_words_from_tweet`` applies them to that same column.

    Returns:
        The DataFrame returned by the last ``remove_words_from_tweet`` call.
    """
    # Processed in this fixed order, one column after the other.
    text_columns = (
        'tidy_tweet',
        'word_tokenize',
        'SnowballStemmer123',
        'Porter',
        'Lancaster',
        'Lemmatized',
    )
    for column_name in text_columns:
        frequent_words = get_frequency_of_words(df_stemming, column_name)
        df_stemming = remove_words_from_tweet(df_stemming, column_name, frequent_words)

    return df_stemming
df_stemming=frequency(df_stemming)

# Final Updated Tweets for Subtask A

In [None]:
# Copy the frequency-filtered text columns from df_stemming into df, then
# wrap every text column in single quotes and drop the id column.
stem_cols = ['word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

df['tidy_tweet_updated'] = df_stemming['tidy_tweet']
for stem_col in stem_cols:
    df[stem_col] = df_stemming[stem_col]

quoted_cols = ['tidy_tweet','tidy_tweet_updated'] + stem_cols
df.update("'" + df[quoted_cols].astype(str) + "'")
del df['id']
df

# Create ARFF File for Subtask A

In [None]:
# Write one ARFF file per text representation: a small header followed by
# the (text, subtask_a) rows.
column=['tidy_tweet_updated','word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

for col in column:
    arff_path = r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\Maverick\SubTaskA\{}_subtaskA.arff".format(col)
    # "w" starts the file afresh. The former "a+" mode appended a second
    # header and data section whenever this cell was run again.
    with open(arff_path, "w") as f:
        f.write("@relation Train\n")
        f.write("@attribute {} string\n".format(col))
        f.write("@attribute subtask_a {NOT,OFF}\n")
        f.write("@data\n")
    # Rows are appended below the header; quoting=3 is csv.QUOTE_NONE.
    df[[col,'subtask_a']].to_csv(arff_path, header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')


# Generate Word Cloud for Subtask A


In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

def Plot_Wordcloud(df,columnname,subtask,target):
    """Draw a word cloud of one text column for the rows of a given class.

    Args:
        df: DataFrame holding the text and label columns.
        columnname: Text column whose values are joined into the cloud input.
        subtask: Label column used to select rows.
        target: Label value the selected rows must equal.
    """
    nltk.download('stopwords')

    # Custom words first, then wordcloud's stop words, then NLTK's English list
    stop_words = ["will","take","should've"]
    stop_words.extend(STOPWORDS)
    stop_words.extend(stopwords.words('english'))

    matching_rows = df[subtask]==target
    all_words = ' '.join(df[columnname][matching_rows])

    cloud_builder = WordCloud(stopwords = stop_words,width=1800,height=1500,max_words=100000)
    wordcloud = cloud_builder.generate(all_words)

    plt.figure(figsize=(20,10),facecolor='k')
    plt.imshow(wordcloud)
    plt.axis('off')
    plt.show()

Plot_Wordcloud(df,'tidy_tweet','subtask_a','OFF')

# Generate N Gram

In [None]:
from textblob import TextBlob
 
# Function to generate n-grams from sentences.
def extract_ngrams(data, num):
    """Return the ``num``-grams of *data* as space-joined strings.

    The n-grams come from ``TextBlob(data).ngrams(num)``.
    """
    joined_grams = []
    for grams in TextBlob(data).ngrams(num):
        joined_grams.append(' '.join(grams))
    return joined_grams

def generate_ngram(df, num=2):
    """Expand every tidy tweet into one row per n-gram.

    Args:
        df: DataFrame with ``tidy_tweet``, ``subtask_a``, ``subtask_b`` and
            ``subtask_c`` columns.
        num: Size of the n-grams; defaults to 2 (bigrams), the value that
            was previously hard-coded.

    Returns:
        DataFrame with columns ``ngram``, ``subtask_a``, ``subtask_b``,
        ``subtask_c``. Each n-gram carries the labels of its source tweet.
    """
    records = {'ngram': [], 'subtask_a': [], 'subtask_b': [], 'subtask_c': []}

    # Walk the four columns in lockstep instead of indexing df[col][i];
    # the unused per-row split of the tweet has been dropped.
    rows = zip(df['tidy_tweet'], df['subtask_a'], df['subtask_b'], df['subtask_c'])
    for tweet, label_a, label_b, label_c in rows:
        for gram in extract_ngrams(tweet, num):
            records['ngram'].append(gram)
            records['subtask_a'].append(label_a)
            records['subtask_b'].append(label_b)
            records['subtask_c'].append(label_c)

    return pd.DataFrame(records)

df_ngram=generate_ngram(df)

df_ngram


# Retrieve Tweets for Subtask B 

In [None]:
# Keep only the rows labelled 'OFF' in subtask_a, with a fresh 0..n-1 index.
# DataFrame.ix was removed in pandas 1.0; .loc does the same boolean-mask
# plus column-list selection.
df_subtask_b = df.loc[df['subtask_a'] == 'OFF', ['tidy_tweet','subtask_a','subtask_b','subtask_c']]
df_subtask_b = df_subtask_b.reset_index(drop=True)
df_subtask_b

In [None]:
# tidy_tweet was wrapped in single quotes when df was finalised for
# Subtask A; strip them again before the text is re-processed.
df_subtask_b['tidy_tweet']=df_subtask_b['tidy_tweet'].str.replace("'","")

# Stemming and Lemmatization for Subtask B

In [None]:
# Build the stemming table for the Subtask B rows: start from tidy_tweet and
# let StemmingAndLemma add one column per tokenizer/stemmer/lemmatizer.
df_stemming_b=pd.DataFrame()
df_stemming_b['tidy_tweet']=df_subtask_b['tidy_tweet']

df_stemming_b=StemmingAndLemma(df_stemming_b,df_subtask_b)

# Remove words with frequency less than 3

In [None]:
# Apply the word-frequency filter to every text column of the Subtask B table.
df_stemming_b=frequency(df_stemming_b)

# Final Updated Tweets for Subtask_B

In [None]:
# Copy the frequency-filtered text columns from df_stemming_b into
# df_subtask_b, then wrap every text column in single quotes.
stem_cols = ['word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

df_subtask_b['tidy_tweet_updated'] = df_stemming_b['tidy_tweet']
for stem_col in stem_cols:
    df_subtask_b[stem_col] = df_stemming_b[stem_col]

quoted_cols = ['tidy_tweet','tidy_tweet_updated'] + stem_cols
df_subtask_b.update("'" + df_subtask_b[quoted_cols].astype(str) + "'")
df_subtask_b

# Create ARFF File for Subtask B

In [None]:
# Write one ARFF file per text representation: a small header followed by
# the (text, subtask_b) rows.
column=['tidy_tweet_updated','word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

for col in column:
    arff_path = r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\Maverick\SubTaskB\{}_subtaskB.arff".format(col)
    # "w" starts the file afresh. The former "a+" mode appended a second
    # header and data section whenever this cell was run again.
    with open(arff_path, "w") as f:
        f.write("@relation Train\n")
        f.write("@attribute {} string\n".format(col))
        f.write("@attribute subtask_b {UNT,TIN}\n")
        f.write("@data\n")
    # Rows are appended below the header; quoting=3 is csv.QUOTE_NONE.
    df_subtask_b[[col,'subtask_b']].to_csv(arff_path, header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')


# WordCloud for Subtask B

In [None]:
# Word cloud of tidy_tweet for the rows whose subtask_b label is 'TIN'.
Plot_Wordcloud(df_subtask_b,'tidy_tweet','subtask_b','TIN')

# Retrieve Tweets for Subtask C

In [None]:
# Keep only the rows labelled 'TIN' in subtask_b, with a fresh 0..n-1 index.
# DataFrame.ix was removed in pandas 1.0; .loc does the same boolean-mask
# plus column-list selection.
df_subtask_c = df_subtask_b.loc[df_subtask_b['subtask_b'] == 'TIN', ['tidy_tweet','subtask_a','subtask_b','subtask_c']]
df_subtask_c = df_subtask_c.reset_index(drop=True)

df_subtask_c

In [None]:
# Strip the single quotes that were added around tidy_tweet for the
# Subtask B export before the text is re-processed.
df_subtask_c['tidy_tweet']=df_subtask_c['tidy_tweet'].str.replace("'","")

# Stemming and Lemmatization for Subtask C

In [None]:
# Build the stemming table for the Subtask C rows: start from tidy_tweet and
# let StemmingAndLemma add one column per tokenizer/stemmer/lemmatizer.
df_stemming_c=pd.DataFrame()
df_stemming_c['tidy_tweet']=df_subtask_c['tidy_tweet']

df_stemming_c=StemmingAndLemma(df_stemming_c,df_subtask_c)

# Remove words with frequency less than 3

In [None]:
# Apply the word-frequency filter to every text column of the Subtask C table.
df_stemming_c=frequency(df_stemming_c)

# Final Updated Tweets for Subtask_C

In [None]:
# Copy the frequency-filtered text columns from df_stemming_c into
# df_subtask_c, then wrap every text column in single quotes.
stem_cols = ['word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

df_subtask_c['tidy_tweet_updated'] = df_stemming_c['tidy_tweet']
for stem_col in stem_cols:
    df_subtask_c[stem_col] = df_stemming_c[stem_col]

quoted_cols = ['tidy_tweet','tidy_tweet_updated'] + stem_cols
df_subtask_c.update("'" + df_subtask_c[quoted_cols].astype(str) + "'")

df_subtask_c

# Create ARFF File for Subtask C

In [None]:
# Write one ARFF file per text representation: a small header followed by
# the (text, subtask_c) rows.
column=['tidy_tweet_updated','word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

for col in column:
    arff_path = r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\Maverick\SubTaskC\{}_subtaskC.arff".format(col)
    # "w" starts the file afresh. The former "a+" mode appended a second
    # header and data section whenever this cell was run again.
    with open(arff_path, "w") as f:
        f.write("@relation Train\n")
        f.write("@attribute {} string\n".format(col))
        f.write("@attribute subtask_c {IND,OTH,GRP}\n")
        f.write("@data\n")
    # Rows are appended below the header; quoting=3 is csv.QUOTE_NONE.
    df_subtask_c[[col,'subtask_c']].to_csv(arff_path, header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')


# WordCloud for Subtask C

In [None]:
# Word cloud of tidy_tweet for the rows whose subtask_c label is 'IND'.
Plot_Wordcloud(df_subtask_c,'tidy_tweet','subtask_c','IND')

# **********  Test File  **********


In [None]:
# Load the three test sets as comma-separated files.
# quotechar='\0' makes NUL the quote character — presumably so that quote
# marks inside tweets are read as ordinary text; verify against the data.
df_test_a=pd.read_csv(r'C:\Users\Mhaiskao\Desktop\Assignments\DMTA\test_a.csv',sep=',', encoding="utf-8",quotechar='\0')
df_test_b=pd.read_csv(r'C:\Users\Mhaiskao\Desktop\Assignments\DMTA\test_b.csv',sep=',', encoding="utf-8",quotechar='\0')
df_test_c=pd.read_csv(r'C:\Users\Mhaiskao\Desktop\Assignments\DMTA\test_c.csv',sep=',', encoding="utf-8",quotechar='\0')

# Remove Punctuations

In [None]:
# Run the same preparation() cleaning on each test set as on the training data.
df_test_a=preparation(df_test_a)
df_test_b=preparation(df_test_b)
df_test_c=preparation(df_test_c)


# Remove Stop Words


In [None]:
def remove_stopwords_test(df_test):
    """Remove stop words from the ``tidy_tweet`` column of a test set in place.

    The stop-word collection combines three custom words, wordcloud's
    ``STOPWORDS`` and NLTK's English list. Words are compared lower-cased.

    Args:
        df_test: DataFrame with a ``tidy_tweet`` column; modified in place.

    Returns:
        ``df_test`` (the same object).
    """
    nltk.download('stopwords')

    # A set gives constant-time membership tests; the combined list was
    # previously scanned end to end for every word of every tweet.
    stop_words = {"will","take","should've"} | set(STOPWORDS) | set(stopwords.words('english'))

    df_test['tidy_tweet'] = [
        ' '.join(word for word in tweet.split() if word.lower() not in stop_words)
        for tweet in df_test['tidy_tweet']
    ]

    return df_test

In [None]:
# Strip stop words from the tidy_tweet column of each test set.
df_test_a=remove_stopwords_test(df_test_a)
df_test_b=remove_stopwords_test(df_test_b)
df_test_c=remove_stopwords_test(df_test_c)

# Stemming and Lemmatization

In [None]:
# For each test set: start a stemming table from tidy_tweet and let
# StemmingAndLemma add one column per tokenizer/stemmer/lemmatizer.
# frequency() is not called on these tables in this cell.

#Test Set A
df_stemming_test_a=pd.DataFrame()
df_stemming_test_a['tidy_tweet']=df_test_a['tidy_tweet']
df_stemming_test_a=StemmingAndLemma(df_stemming_test_a,df_test_a)


#Test Set B
df_stemming_test_b=pd.DataFrame()
df_stemming_test_b['tidy_tweet']=df_test_b['tidy_tweet']
df_stemming_test_b=StemmingAndLemma(df_stemming_test_b,df_test_b)

#Test Set C

df_stemming_test_c=pd.DataFrame()
df_stemming_test_c['tidy_tweet']=df_test_c['tidy_tweet']
df_stemming_test_c=StemmingAndLemma(df_stemming_test_c,df_test_c)


In [None]:
# Copy the stemmed text columns from df_stemming_test_a into df_test_a,
# wrap every text column in single quotes, then drop id and tweet.
stem_cols = ['word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

df_test_a['tidy_tweet_updated'] = df_stemming_test_a['tidy_tweet']
for stem_col in stem_cols:
    df_test_a[stem_col] = df_stemming_test_a[stem_col]

quoted_cols = ['tidy_tweet','tidy_tweet_updated'] + stem_cols
df_test_a.update("'" + df_test_a[quoted_cols].astype(str) + "'")
for dropped_col in ('id', 'tweet'):
    del df_test_a[dropped_col]
df_test_a

In [None]:
# Copy the stemmed text columns from df_stemming_test_b into df_test_b,
# wrap every text column in single quotes, then drop id and tweet.
stem_cols = ['word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

df_test_b['tidy_tweet_updated'] = df_stemming_test_b['tidy_tweet']
for stem_col in stem_cols:
    df_test_b[stem_col] = df_stemming_test_b[stem_col]

quoted_cols = ['tidy_tweet','tidy_tweet_updated'] + stem_cols
df_test_b.update("'" + df_test_b[quoted_cols].astype(str) + "'")
for dropped_col in ('id', 'tweet'):
    del df_test_b[dropped_col]
df_test_b

In [None]:
# Copy the stemmed text columns from df_stemming_test_c into df_test_c,
# wrap every text column in single quotes, then drop id and tweet.
stem_cols = ['word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

df_test_c['tidy_tweet_updated'] = df_stemming_test_c['tidy_tweet']
for stem_col in stem_cols:
    df_test_c[stem_col] = df_stemming_test_c[stem_col]

quoted_cols = ['tidy_tweet','tidy_tweet_updated'] + stem_cols
df_test_c.update("'" + df_test_c[quoted_cols].astype(str) + "'")
for dropped_col in ('id', 'tweet'):
    del df_test_c[dropped_col]
df_test_c

In [None]:
# Add a constant '?' column to each test set.
# NOTE(review): presumably '?' is meant as Weka's missing-value marker for
# the unknown class. The ARFF export cells below write the subtask_a/b/c
# columns, not these *_weka columns — confirm which one should be exported.
df_test_c['subtask_c_weka']='?'
df_test_b['subtask_b_weka']='?'
df_test_a['subtask_a_weka']='?'


# Create ARFF Files for Test Set A, B, C

In [None]:
# Write one test ARFF file per text representation: a small header followed
# by the (text, subtask_a) rows.
# NOTE(review): this requires a subtask_a column in df_test_a; confirm the
# test CSV provides it.
column=['tidy_tweet_updated','word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

for col in column:
    arff_path = r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\Maverick\SubTaskA_TEST\{}_subtaskA_Test.arff".format(col)
    # "w" starts the file afresh. The former "a+" mode appended a second
    # header and data section whenever this cell was run again.
    with open(arff_path, "w") as f:
        f.write("@relation Test\n")
        f.write("@attribute {} string\n".format(col))
        f.write("@attribute subtask_a {NOT,OFF}\n")
        f.write("@data\n")
    # Rows are appended below the header; quoting=3 is csv.QUOTE_NONE.
    df_test_a[[col,'subtask_a']].to_csv(arff_path, header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')


In [None]:
# Write one test ARFF file per text representation: a small header followed
# by the (text, subtask_b) rows.
# NOTE(review): this requires a subtask_b column in df_test_b; confirm the
# test CSV provides it.
column=['tidy_tweet_updated','word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

for col in column:
    arff_path = r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\Maverick\SubTaskB_TEST\{}_subtaskB_Test.arff".format(col)
    # "w" starts the file afresh. The former "a+" mode appended a second
    # header and data section whenever this cell was run again.
    with open(arff_path, "w") as f:
        f.write("@relation Test\n")
        f.write("@attribute {} string\n".format(col))
        f.write("@attribute subtask_b {UNT,TIN}\n")
        f.write("@data\n")
    # Rows are appended below the header; quoting=3 is csv.QUOTE_NONE.
    df_test_b[[col,'subtask_b']].to_csv(arff_path, header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')


In [None]:
# Write one test ARFF file per text representation: a small header followed
# by the (text, subtask_c) rows.
# NOTE(review): this requires a subtask_c column in df_test_c; confirm the
# test CSV provides it.
column=['tidy_tweet_updated','word_tokenize','SnowballStemmer123','Porter','Lancaster','Lemmatized']

for col in column:
    arff_path = r"C:\Users\Mhaiskao\Desktop\Assignments\DMTA\Maverick\SubTaskC_TEST\{}_subtaskC_Test.arff".format(col)
    # "w" starts the file afresh. The former "a+" mode appended a second
    # header and data section whenever this cell was run again.
    with open(arff_path, "w") as f:
        f.write("@relation Test\n")
        f.write("@attribute {} string\n".format(col))
        # Nominal values are listed in the same order as in the Subtask C
        # training header ({IND,OTH,GRP}); this cell used {IND,GRP,OTH},
        # so the train and test declarations did not match.
        f.write("@attribute subtask_c {IND,OTH,GRP}\n")
        f.write("@data\n")
    # Rows are appended below the header; quoting=3 is csv.QUOTE_NONE.
    df_test_c[[col,'subtask_c']].to_csv(arff_path, header=False,index=False,mode='a',quotechar="'",quoting=3,escapechar='"')
