In [1]:
import pandas as pd
import re
from textblob import TextBlob, Word
from nltk.tokenize import TweetTokenizer
from collections import defaultdict
from nltk.corpus import stopwords
def openTSV(filepath: str):
    return pd.read_csv(filepath, sep='\t')

# function to extract certain keywords
def extractKeywords(tweet: str):
    stops = stopwords.words('english')
    tw = TweetTokenizer(strip_handles=False, reduce_len=True)

    # remove all emoji
    tweet = re.sub(r'\\[a-z0-9]{5}', '', tweet)

    # remove all links
    tweet = re.sub(r'(http:\/\/www\.|https:\/\/www\.|http:\/\/|https:\/\/)?[a-z0-9]+([\-\.]{1}[a-z0-9]+)*\.[a-z]{2,5}(:[0-9]{1,5})?(\/.*)?', '', tweet)
    
    # lower case all words
    tweet.lower()
    # extract and then remove hashtags
    hashtags = re.findall(r"#\w+", tweet)
    tweet = re.sub(r'#\w*', '', tweet)
    
    # extracting nouns and filtering stop words
    tweet = ' '.join(tw.tokenize(tweet))
    blob = TextBlob(tweet)
    nouns = [word for word in blob.noun_phrases if word not in stops]
    nouns = [Word(word) for word in nouns]
    
    # removing all unimportant punctuations
    nouns = [word.strip(' /\\*.;,') for word in nouns if word.strip(' /\\*.;,') != '']
    
    # returning all nouns and hashtags
    keywords = nouns + hashtags
    return keywords


In [2]:

# This is the first stage of preprocessing which just creates a tsv file that has ID, location, and keywords
def preprocess(trainFilePath: str, storeFilePath: str="./data/preprocessed3.csv"):
    st = open(storeFilePath, 'w')
    df = openTSV(trainFilePath)
    
    # writing headers
    st.write('ID\tLocation\tkeywords\n')
    
    # extracting keywords and writing them to the file
    for i, row in df.iterrows():
        keywords = extractKeywords(row[2])
        if keywords == []:
            continue
        st.write("{}\t{}\t{}\n".format(i+1, row[1], ','.join(keywords)))

    st.close()
    return

# removing unnecessary words, part of feature selection
def deleteUnnecessaryWord(h):
    x = defaultdict(lambda: defaultdict(int))         
    
    # iterate through each word
    for k1 in h.keys():
        sum = 0
        # iterate through each user
        for k2 in h[k1].keys():
            sum += h[k1][k2]
        
        # only include if word is mentioned above certain number of times
        if sum >= 10:
            x[k1] = h[k1]
    
    # return the filtered keywords
    return x

# the second stage of preprocessing which turns each keyword into features for each instance
def serializedPreprocessedData(tokenizedCSV: str, furtherPreprocess: str):
    # calculate the frequency of each word of each user
    fp = open(tokenizedCSV, 'r')
    a = open(furtherPreprocess, 'w')
    freq = defaultdict(lambda: defaultdict(int))
    for i in fp:
        x = i[:-1]
        x = x.split('\t')
        # getting the keywords
        keys = x[2].split(',')
        
        # adding the keyword for each user
        for j in keys:
            freq[j][x[0]] += 1
    
    # filtering the unnecessary keywords
    freq = deleteUnnecessaryWord(freq)
    
    # storing data to a csv file
    attr = sorted(freq.keys())
    a.write("{},{},{}\n".format("ID", ','.join(attr), "Class"))
    fp.seek(0, 0)
    fp.readline()
    for i in fp:
        # taking all data except class label
        x = i[:-1].split('\t')
        
        # keep appending feature data
        a.write("{}".format(x[0]))
        for y in attr:
            a.write(",{}".format(freq[y][x[0]]))
        
        # write class label
        a.write(",{}\n".format(x[1]))


    fp.close()
    a.close()


In [3]:
# main data
filepath = "./data/train-raw.tsv"

# file to store data of first stage of preprocessing
tokenizedCSV = "./data/preprocessed.csv"

# file to store data of second stage of preprocessing
furtherPreprocess = "./data/further.csv"

In [5]:
# first stage preprocessing
preprocess(filepath, tokenizedCSV)

# second stage preprocessing
serializedPreprocessedData(tokenizedCSV, furtherPreprocess)

In [6]:
from copy import copy

# a function to change test data into dataframes for classifier use
def fitTestData(fp: str, preprocessed: str):
    data = open(fp, 'r')
    preprocessed = open(preprocessed, 'r')
    # getting the header
    header = preprocessed.readline()
    header = header.split(',')[1:-1]
    
    # user ids
    ids = []
    
    # will contain all data
    vals = {}
    for i in header:
        vals[i] = []
        
    # class labels
    y = []
    
    # iterate through every row, avoiding header
    data.readline()
    for row in data:
        x = row.split('\t')
        
        # extracting keywords
        keys = extractKeywords(x[2])
        
        # getting user id
        ids.append(x[0])
        
        # getting class label
        y.append(x[1])
        
        # calculating frequency of keyword for each instance
        instance = {}
        for i in keys:
            if i not in instance:
                instance[i] = 1
            else:
                instance[i] += 1
        
        # iterate through each keyword
        for i in vals.keys():
            # if an instance has a keyword, the frequency value is appended
            if i in instance:
                vals[i].append(instance[i])
            # all the keywords not in instance are given zero value as it is not in instance
            else:
                vals[i].append(0)
    # converting to dataframe and returning data
    df = pd.DataFrame(vals)
    return ids, df, y

# function to process the processed training data and convert it to dataframe
def fitTrainData(data: str):
    data = pd.read_csv(data)
    return data.iloc[:, 1:-1], data.iloc[:, -1]  


In [7]:
from sklearn.naive_bayes import MultinomialNB, BernoulliNB, GaussianNB
from sklearn.tree import DecisionTreeClassifier

# convert all data
_, X_test, y_test = fitTestData("./data/dev-raw.tsv", furtherPreprocess)
X_train, y_train = fitTrainData(furtherPreprocess)

# all the classifiers to use
classifiers = {"MultinomialNB": MultinomialNB(), "DecisionTree": DecisionTreeClassifier(), "BernoulliNB": BernoulliNB(), "GaussianNB": GaussianNB()}

# printing the classifiers accuracy
for i in classifiers:
    classifiers[i].fit(X_train, y_train)
    score = classifiers[i].score(X_test, y_test)
    print("Classifier: {}, Score: {}\n".format(i, score))


Classifier: MultinomialNB, Score: 0.30338192732340014

Classifier: DecisionTree, Score: 0.30338192732340014

Classifier: BernoulliNB, Score: 0.30710687104727197

Classifier: GaussianNB, Score: 0.296387608532533



 ## Results
 ### Without counting how many users use each word
 1. Classifier: MultinomialNB, Score: 0.30338192732340014, Kaggle Score: 0.30205
 2. Classifier: DecisionTree, Score: 0.3034623217922607, Kaggle Score: 0.28988
 3. Classifier: BernoulliNB, Score: 0.30710687104727197, Kaggle Score: 0.29247
 4. Classifier: GaussianNB, Score: 0.296387608532533
 
 
 ### With counting how many users use each word
 1. Classifier: MultinomialNB, Score: 0.30316754207310537
 2. Classifier: DecisionTree, Score: 0.30322113838567905
 3. Classifier: BernoulliNB, Score: 0.3074820452352878
 4. Classifier: GaussianNB, Score: 0.2960660306570908
 
 
 ### Using binary features (For Bernoulli Only)
 1. Classifier: BernoulliNB, Score: 0.3074820452352878
 2. Classifier: DecisionTree, Score: 0.3034623217922607

In [8]:
# script to test the test-raw.tsv
ids, X_actualTest, _ = fitTestData("./data/test-raw.tsv", furtherPreprocess)

# creating prediction and store it appropriately
predict = classifiers["MultinomialNB"].predict(X_actualTest)
pd.DataFrame({"Id": ids, "Class": list(predict)}).to_csv("./data/result.csv", index=False)
