In [None]:
import pandas as pd
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
import string
import numpy as np

# Training

In [None]:
# Load the training set; the cells below read its 'text' and 'airline_sentiment' columns.
train_data = pd.read_csv('training_twitter_x_y_train.csv')

In [None]:
# Model input: the raw tweet text. Target: the sentiment label stored for each tweet.
X_train = train_data['text']
Y_train = train_data['airline_sentiment'] 

In [None]:
# Encode the sentiment labels as integers: 'positive' -> 0, 'negative' -> 1,
# every other value -> 2.
# The codes are built from the DataFrame column with a vectorised map rather than
# assigned row by row through .iloc: writing into the column slice triggers pandas'
# SettingWithCopyWarning, and re-running such a loop turns every label into 2
# because the values are no longer strings. Reading from train_data each time
# keeps this cell idempotent.
label_to_id = {'positive': 0, 'negative': 1}
Y_train = train_data['airline_sentiment'].map(label_to_id).fillna(2).astype(int)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [None]:
# Sentiment names indexed by their integer label: 0 = positive, 1 = negative, 2 = neutral.
classes = [
    'positive',
    'negative',
    'neutral',
]

In [None]:
def get_simple_pos_tag(nltk_pos_tag):
    """Map an NLTK part-of-speech tag to the matching WordNet POS constant.

    Parameters
    ----------
    nltk_pos_tag : str
        Tag as found in the second element of the pairs returned by ``pos_tag``.

    Returns
    -------
    ``wordnet.ADJ`` for tags starting with 'J', ``wordnet.VERB`` for 'V',
    ``wordnet.ADV`` for 'R', and ``wordnet.NOUN`` for everything else.
    """
    prefix_to_wordnet = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_wordnet:
        if nltk_pos_tag.startswith(prefix):
            return wordnet_pos
    # Any other tag is treated as a noun.
    return wordnet.NOUN

In [None]:
def preprocess(words_list):
    """Strip punctuation noise from the tokens of one document.

    Each token goes through these steps, in order:
      1. tab characters and every ``string.punctuation`` character except the
         apostrophe are deleted (the apostrophe is kept so that words such as
         "don't" stay intact);
      2. the token is discarded if nothing is left;
      3. a leading apostrophe is removed, together with the trailing one when
         the token is wrapped in a pair of them ('quoted' -> quoted);
      4. tokens made only of digits and tokens of two characters or fewer are
         discarded.

    Letter case is left untouched and the input list is not modified.

    Parameters
    ----------
    words_list : list of str
        Raw tokens.

    Returns
    -------
    list of str
        The surviving, cleaned tokens in their original order.
    """
    # str.maketrans('', '', chars) builds a table that deletes every character of `chars`.
    removable_chars = '\t' + string.punctuation.replace("'", "")
    deletion_table = str.maketrans('', '', removable_chars)

    cleaned_words = []
    for raw_word in words_list:
        token = raw_word.translate(deletion_table)
        if not token:
            continue

        # Unquote: 'word' -> word, 'word -> word. A trailing-only apostrophe is kept.
        if token.startswith("'"):
            token = token[1:-1] if token.endswith("'") else token[1:]

        # Purely numeric strings and very short tokens are dropped.
        if token.isdigit() or len(token) <= 2:
            continue

        cleaned_words.append(token)

    return cleaned_words

In [None]:
# Tokens to discard: NLTK's English stop words plus every single punctuation character.
punctuation = list(string.punctuation)
stop_words = set(stopwords.words('english')).union(punctuation)

# One WordNet lemmatizer instance, shared by all calls to clean_review.
lemmatizer = WordNetLemmatizer()

def clean_review(word_list):
    """Turn the raw tokens of one tweet into a list of lower-cased lemmas.

    The tokens are cleaned with ``preprocess``, English stop words (compared
    case-insensitively) are dropped, and every remaining token is lemmatized
    using the WordNet POS derived from its NLTK tag.

    Parameters
    ----------
    word_list : list of str
        Tokens of a single tweet, e.g. ``tweet.split(' ')``.

    Returns
    -------
    list of str
        Lower-cased lemmas of the kept tokens, in their original order.
    """
    word_list = preprocess(word_list)
    if not word_list:
        return []

    # Tag the whole tweet in a single call: the tagger can then use the
    # neighbouring words as context, and it is invoked once per tweet instead
    # of once per word. Tagging uses the original casing.
    tagged_words = pos_tag(word_list)

    final_words = []
    for word, nltk_tag in tagged_words:
        lowered = word.lower()
        if lowered in stop_words:
            continue
        # WordNet lookups are case-sensitive, so the lower-cased form is
        # lemmatized; otherwise "Flights" would stay "Flights" and end up as a
        # different feature from "flight".
        lemma = lemmatizer.lemmatize(lowered, pos = get_simple_pos_tag(nltk_tag))
        final_words.append(lemma)
    return final_words

In [None]:
# Clean every training tweet: split it on single spaces, run the tokens through
# clean_review and join the surviving words back into one space-separated string.
training_tweets = [
    " ".join(clean_review(raw_tweet.split(' ')))
    for raw_tweet in X_train
]

In [None]:
# count_vectorizer = CountVectorizer(max_features = 5000, max_df = 1.0, min_df = 1, ngram_range = (1, 1))
# X_train_transformed = count_vectorizer.fit_transform(training_tweets)
# Y_train = np.asarray(Y_train, dtype = 'int')

In [None]:
# Build TF-IDF unigram features, keeping at most the 5000 most frequent terms.
# max_df must be the float 1.0 (a proportion, i.e. no upper limit): an integer is
# read as an absolute document count, so max_df = 1 would discard every term that
# occurs in more than one tweet and leave only words seen in a single tweet.
tfidf_vectorizer = TfidfVectorizer(max_features = 5000, max_df = 1.0, min_df = 1, ngram_range = (1, 1))
X_train_transformed = tfidf_vectorizer.fit_transform(training_tweets)
# The estimators below expect a plain integer label array.
Y_train = np.asarray(Y_train, dtype = 'int')

In [None]:
# Support-vector classifier with default hyper-parameters (fit returns the estimator itself).
svc = SVC().fit(X_train_transformed, Y_train)
# Accuracy measured on the training data itself, not on held-out data.
svc.score(X_train_transformed, Y_train)

0.7410746812386156

In [None]:
# Multinomial naive Bayes with default hyper-parameters (fit returns the estimator itself).
mnb = MultinomialNB().fit(X_train_transformed, Y_train)
# Accuracy measured on the training data itself, not on held-out data.
mnb.score(X_train_transformed, Y_train)

0.6467213114754098

In [None]:
# Decision tree with default hyper-parameters. random_state is fixed because the
# tree permutes the candidate features at every split; without a seed, ties
# between equally good splits can be broken differently on each run, so the
# fitted tree (and its score) would not be reproducible.
dt = DecisionTreeClassifier(random_state = 42)
dt.fit(X_train_transformed, Y_train)
# Accuracy measured on the training data itself, not on held-out data.
dt.score(X_train_transformed, Y_train)

0.7410746812386156

# Testing

In [None]:
# Load the tweets to predict on; only the 'text' column is read below.
test_data = pd.read_csv('test_twitter_x_test.csv')

In [None]:
# Raw text of the test tweets; cleaned with clean_review before vectorizing.
X_test = test_data['text']

In [None]:
# Clean the test tweets exactly like the training tweets: split on single spaces,
# run the tokens through clean_review and join the surviving words with spaces.
testing_tweets = [
    " ".join(clean_review(raw_tweet.split(' ')))
    for raw_tweet in X_test
]

In [None]:
# Use the vectorizer already fitted on the training tweets (transform only, no
# refit), so the test features share the training vocabulary.
X_test_transformed = tfidf_vectorizer.transform(testing_tweets)
# Integer class predictions from the SVC model.
Y_predictions = svc.predict(X_test_transformed)

In [None]:
# Translate the integer predictions back to sentiment names and write them out.
data_type = np.dtype('U25')
# Indexing the array of class names with the prediction array looks up all
# names at once; the result keeps the fixed-width unicode dtype.
result = np.asarray(classes, dtype = data_type)[Y_predictions]
np.savetxt("submission.csv", result, fmt = '%s')