In [None]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
import spacy

from sklearn.metrics import accuracy_score,classification_report, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
# DATA CLEANING AND PREPARATION #
class Utils(object):

    def cleanText(self, text):
        review = re.sub(r"^https://t.co/[a-zA-Z0-9]*\s", " ", str(text))
        review = re.sub(r"\([\s\S]*\)", " ", str(review))
        review = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", " ", str(review))
        review = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", " ", str(review))
        review = review.lower()
        review = re.sub(r"that's", "that is", str(review))
        review = re.sub(r"there's", "there is", str(review))
        review = re.sub(r"what's", "what is", str(review))
        review = re.sub(r"where's", "where is", str(review))
        review = re.sub(r"it's", "it is", str(review))
        review = re.sub(r"who's", "who is", str(review))
        review = re.sub(r"i'm", "i am", str(review))
        review = re.sub(r"she's", "she is", str(review))
        review = re.sub(r"he's", "he is", str(review))
        review = re.sub(r"they're", "they are", str(review))
        review = re.sub(r"who're", "who are", str(review))
        review = re.sub(r"ain't", "am not", str(review))
        review = re.sub(r"wouldn't", "would not", str(review))
        review = re.sub(r"shouldn't", "should not", str(review))
        review = re.sub(r"can't", "can not", str(review))
        review = re.sub(r"couldn't", "could not", str(review))
        review = re.sub(r"won't", "will not", str(review))
        review = re.sub(r" pm ", " ", str(review))
        review = re.sub(r" am ", " ", str(review))
        review = re.sub(r'[^\[\]]+(?=\])', " ", str(review))
        review = re.sub(r"\W", " ", str(review))
        review = re.sub(r"\d", " ", str(review))
        review = re.sub(r"\s+[a-z]\s+", " ", str(review))
        review = re.sub(r"\s+[a-z]$", " ", str(review))
        review = re.sub(r"^[a-z]\s+", " ", str(review))
        review = re.sub(r"\s+", " ", str(review))
        return review

    def remove_punc(self, text):
        table = str.maketrans("", "", string.punctuation)
        return text.translate(table)

    def remove_emoticon(self, text):
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)


    
    def lemmatization(self, text):
        doc = nlp(text)
        return " ".join([token.lemma_ for token in doc])

    nltk.download('stopwords')
    def remove_stops(self, text):
        stop_words = set(nltk.corpus.stopwords.words('english'))
        text = [word.lower() for word in text.split() if word.lower() not in stop_words]
        return " ".join(text)


    def readData1(self, path, inputColumnIndex=0, outputColumnIndex=1):
        df = pd.read_csv(path, error_bad_lines=False, sep='\t')
        X = df.iloc[:, inputColumnIndex].values
        y = df.iloc[:, outputColumnIndex].values
        return X,y
    
    def readData2(self, path, inputColumnIndex=1, outputColumnIndex=2):
        df = pd.read_csv(path, error_bad_lines=False, sep='\t')
        X = df.iloc[:, inputColumnIndex].values
        y = df.iloc[:, outputColumnIndex].values
        return X,y
    
    def draw_prediction_results(self, y_pred, y_test, my_tags, method):
        print('accuracy of ' + method + ': %s' % accuracy_score(y_pred, y_test))
        print(classification_report(y_test, y_pred, target_names=my_tags, digits = 6))
    
    def create_csv(self, y_pred, y_test, classifier):
        report = classification_report(y_test, y_pred)
        lines = report.split('\n')
        line = lines[-2].split()
        line.remove('weighted')
        line[0] = classifier
        df = pd.DataFrame(line).transpose()
        csv_data = df.to_csv(index=False)
        df.to_csv('Kannada.csv',  mode='a', header=False, index=False)
        
    
    def crossValidation(self, prediction, input, output, k=5):
        scores = cross_val_score(prediction, input,output, cv=k)
        print("Accuracy of Cross Validation Mean: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# FUNCTION CALL FOR DATA CLEANING AND PREPARATION #
utils = Utils()
X1,y_train=utils.readData1('kannada_sentiment_full_train.tsv')

ourTags =['not-Kannada', 'unknown_state', 'Positive', 'Mixed_feelings', 'Negative']
X_train=[]

for i in range(0, len(X1)):
    t = utils.cleanText(X1[i])
    t = utils.remove_emoticon(t)
    t = utils.remove_punc(t)
    t = utils.remove_stops(t)
    t = utils.lemmatization(t)
    X_train.append(t)

print(X_train[:10])
#X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.3, random_state=0)

['ಒ ದ ದ ಶದ ಮ ದ ವರ ಯ ವ ದ ಅದರ ಆರ ಥ ಕ ಸ ಥ ತ ಯನ ನ ಅವಲ ಬ ಸ ವ ದ ಲ ಲ ಅವರ ಮ ನವ ಯತ ಯಲ ಲ ಎಷ ಟ ಸಮರ ಥರ ಎನ ನ ವ ದನ ನ ಅವಲ ಬ ಸ ದ ಭ ರತದಲ ಲ ಅನಕ ಷರತ ಇದ ಆ ಅನಕ ಷರಸ ಥರನ ನ ಅವರ ಅನಕ ಷರತ ಯ', 'ಕನ ನಡದಲ ಲ ಡ ಲ ಟ ಕ ಅಪ ಡ ಟ ಸ ಪಡ ಯಲ ಸಬ ಸ ಕ ರ ಬ ಮ ಡ ನಮ ಮ ಚನ ನ ಲ ಗ', 'super sar song', 'tiktoker present situation nನ ಡ ವವರ ಯ ರ ನಮ ಮ ವ ಡ ಯ ನ', 'super ಸ ಗ ವ ರ ನ ಸ', 'varshakke thagadu movie madi industry haal mado hero galu ondh kade adrenn varsha kasta pattu ondhu olle film mado namma rakshith shetty haagu yash innondh kade', 'tickets amount adru mosa illa love', 'super super super film explain', 'wild rex ಕಟ ಟಬ ಕ bronಖ ಡ ತ ಕಟ ಟ ತ ತ bro', 'shankaragouda desaigoudra super']


In [None]:
# FUNCTION CALL FOR DATA CLEANING AND PREPARATION #
utils = Utils()
X2,y_test=utils.readData2('kannada_sentiment_full_test_withlabels.tsv')

X_test=[]

for i in range(0, len(X2)):
    t = utils.cleanText(X2[i])
    t = utils.remove_emoticon(t)
    t = utils.remove_punc(t)
    t = utils.remove_stops(t)
    t = utils.lemmatization(t)
    X_test.append(t)
#print(corpus)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
line = ['', 'precision', 'recall', 'f1-score',  'support']
df = pd.DataFrame(line).transpose()
csv_data = df.to_csv(index=False)
df.to_csv('Kannada.csv',  mode='a', header=False, index=False)

In [None]:
# LOGISTIC REGRESSION #
lrp = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2,analyzer='word', ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression(max_iter=1000))
                ])
lrp.fit(X_train, y_train)
y_pred = lrp.predict(X_test)

utils.create_csv(y_pred, y_test, 'LR')

#prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Logistic Regression")

accuracy of Logistic Regression: 0.6158854166666666
                precision    recall  f1-score   support

   not-Kannada   0.444444  0.061538  0.108108        65
 unknown_state   0.642857  0.573248  0.606061       157
      Positive   0.627490  0.842246  0.719178       374
Mixed_feelings   0.597701  0.472727  0.527919       110
      Negative   0.400000  0.193548  0.260870        62

      accuracy                       0.615885       768
     macro avg   0.542499  0.428662  0.444427       768
  weighted avg   0.592508  0.615885  0.579943       768



In [None]:
# MULTINOMIAL NAIVE BAYES #
multinomial_naive_bayes = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('multinomial_naive_bayes',
                         MultinomialNB())
                        ])
multinomial_naive_bayes.fit(X_train, y_train)
y_pred = multinomial_naive_bayes.predict(X_test)

utils.create_csv(y_pred, y_test, 'MNB')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Multinomial Naive Bayes")

accuracy of Multinomial Naive Bayes: 0.6171875
                precision    recall  f1-score   support

   not-Kannada   0.400000  0.030769  0.057143        65
 unknown_state   0.728070  0.528662  0.612546       157
      Positive   0.602862  0.901070  0.722401       374
Mixed_feelings   0.611111  0.400000  0.483516       110
      Negative   0.444444  0.129032  0.200000        62

      accuracy                       0.617188       768
     macro avg   0.557298  0.397907  0.415121       768
  weighted avg   0.599681  0.617188  0.567251       768



In [None]:
# LINEAR SVM #
linear_svm = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('linear_svc',
                        SVC(kernel='linear'))
                        ])
linear_svm.fit(X_train, y_train)
y_pred = linear_svm.predict(X_test)

utils.create_csv(y_pred, y_test, 'L-SVM')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Linear SVM")

accuracy of Linear SVM: 0.6041666666666666
                precision    recall  f1-score   support

   not-Kannada   0.312500  0.076923  0.123457        65
 unknown_state   0.611111  0.560510  0.584718       157
      Positive   0.640167  0.818182  0.718310       374
Mixed_feelings   0.556701  0.490909  0.521739       110
      Negative   0.333333  0.177419  0.231579        62

      accuracy                       0.604167       768
     macro avg   0.490763  0.424789  0.435960       768
  weighted avg   0.569770  0.604167  0.573206       768



In [None]:
# RBF SVM #
rbf_svm = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('rbf_svc',
                        SVC(kernel='rbf', gamma=1))
                        ])
rbf_svm.fit(X_train, y_train)
y_pred = rbf_svm.predict(X_test)

utils.create_csv(y_pred, y_test, 'R-SVM')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"RBF SVM")

accuracy of RBF SVM: 0.6158854166666666
                precision    recall  f1-score   support

   not-Kannada   0.428571  0.046154  0.083333        65
 unknown_state   0.675439  0.490446  0.568266       157
      Positive   0.629703  0.850267  0.723549       374
Mixed_feelings   0.619565  0.518182  0.564356       110
      Negative   0.360000  0.290323  0.321429        62

      accuracy                       0.615885       768
     macro avg   0.542656  0.439074  0.452187       768
  weighted avg   0.598805  0.615885  0.582356       768



In [None]:
# POLY SVM #
poly_svm = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('poly_svc',
                        SVC(kernel='poly',degree = 1))
                        ])
poly_svm.fit(X_train, y_train)
y_pred = poly_svm.predict(X_test)

utils.create_csv(y_pred, y_test, 'P-SVM')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"POLY SVM")

accuracy of POLY SVM: 0.60546875
                precision    recall  f1-score   support

   not-Kannada   0.294118  0.076923  0.121951        65
 unknown_state   0.618056  0.566879  0.591362       157
      Positive   0.640167  0.818182  0.718310       374
Mixed_feelings   0.556701  0.490909  0.521739       110
      Negative   0.343750  0.177419  0.234043        62

      accuracy                       0.605469       768
     macro avg   0.490558  0.426062  0.437481       768
  weighted avg   0.570475  0.605469  0.574636       768



In [None]:
# RANDOM FOREST #
random_forest = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('random_forest',
                         RandomForestClassifier())
                        ])
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

utils.create_csv(y_pred, y_test, 'RF')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Random Forest")

accuracy of Random Forest: 0.5833333333333334
                precision    recall  f1-score   support

   not-Kannada   0.187500  0.046154  0.074074        65
 unknown_state   0.675000  0.515924  0.584838       157
      Positive   0.659674  0.756684  0.704857       374
Mixed_feelings   0.513043  0.536364  0.524444       110
      Negative   0.250000  0.354839  0.293333        62

      accuracy                       0.583333       768
     macro avg   0.457043  0.441993  0.436309       768
  weighted avg   0.568770  0.583333  0.567873       768



In [None]:
# KNeighborsClassifier #
knn = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('kNN', KNeighborsClassifier(n_neighbors=3))
                        ])
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

utils.create_csv(y_pred, y_test, 'kNN')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"KNeighborsClassifier")

accuracy of KNeighborsClassifier: 0.3776041666666667
                precision    recall  f1-score   support

   not-Kannada   0.142857  0.138462  0.140625        65
 unknown_state   0.621212  0.261146  0.367713       157
      Positive   0.725118  0.409091  0.523077       374
Mixed_feelings   0.191816  0.681818  0.299401       110
      Negative   0.324324  0.193548  0.242424        62

      accuracy                       0.377604       768
     macro avg   0.401066  0.336813  0.314648       768
  weighted avg   0.545857  0.377604  0.404254       768



In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# EXTRA TREE CLASSIFIER #
extra_tree = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('extra tree',
                         ExtraTreesClassifier())
                        ])
extra_tree.fit(X_train, y_train)
y_pred = extra_tree.predict(X_test)

utils.create_csv(y_pred, y_test, 'XTree')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Extra Tree Classifier")

accuracy of Extra Tree Classifier: 0.5846354166666666
                precision    recall  f1-score   support

   not-Kannada   0.137931  0.061538  0.085106        65
 unknown_state   0.679688  0.554140  0.610526       157
      Positive   0.674699  0.748663  0.709759       374
Mixed_feelings   0.521739  0.545455  0.533333       110
      Negative   0.222222  0.290323  0.251748        62

      accuracy                       0.584635       768
     macro avg   0.447256  0.440024  0.438095       768
  weighted avg   0.571853  0.584635  0.574361       768



In [None]:
# VOTING CLASSIFIER #
# ALL CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp), ("Multinomial Naive Bayes", multinomial_naive_bayes), ("Extra Tree", extra_tree), ("Random Forest", random_forest), ("Poly SVM", poly_svm), ("RBF SVM", rbf_svm), ("KNeighborsClassifier", knn)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEns')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.6119791666666666
                precision    recall  f1-score   support

   not-Kannada   0.300000  0.046154  0.080000        65
 unknown_state   0.649254  0.554140  0.597938       157
      Positive   0.621302  0.842246  0.715096       374
Mixed_feelings   0.580645  0.490909  0.532020       110
      Negative   0.458333  0.177419  0.255814        62

      accuracy                       0.611979       768
     macro avg   0.521907  0.422174  0.436174       768
  weighted avg   0.580843  0.611979  0.574095       768



In [None]:
# VOTING CLASSIFIER #
# TOP 5 CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp),  ("Random Forest", random_forest), ("Poly SVM", poly_svm), ("RBF SVM", rbf_svm)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEns5')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.6080729166666666
                precision    recall  f1-score   support

   not-Kannada   0.384615  0.076923  0.128205        65
 unknown_state   0.630435  0.554140  0.589831       157
      Positive   0.628283  0.831551  0.715765       374
Mixed_feelings   0.569892  0.481818  0.522167       110
      Negative   0.379310  0.177419  0.241758        62

      accuracy                       0.608073       768
     macro avg   0.518507  0.424370  0.439545       768
  weighted avg   0.579637  0.608073  0.574297       768



In [None]:
# VOTING CLASSIFIER #
# TOP 3 CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp), ("Poly SVM", poly_svm)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEns3')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.60546875
                precision    recall  f1-score   support

   not-Kannada   0.294118  0.076923  0.121951        65
 unknown_state   0.613793  0.566879  0.589404       157
      Positive   0.641509  0.818182  0.719154       374
Mixed_feelings   0.556701  0.490909  0.521739       110
      Negative   0.343750  0.177419  0.234043        62

      accuracy                       0.605469       768
     macro avg   0.489974  0.426062  0.437258       768
  weighted avg   0.570257  0.605469  0.574647       768



In [None]:
# VOTING CLASSIFIER #
# BEST OF ALL CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp), ("Multinomial Naive Bayes", multinomial_naive_bayes), ("Extra Tree", extra_tree)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEnsA')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.6158854166666666
                precision    recall  f1-score   support

   not-Kannada   0.444444  0.061538  0.108108        65
 unknown_state   0.642857  0.573248  0.606061       157
      Positive   0.625247  0.847594  0.719637       374
Mixed_feelings   0.590909  0.472727  0.525253       110
      Negative   0.416667  0.161290  0.232558        62

      accuracy                       0.615885       768
     macro avg   0.544025  0.423280  0.438323       768
  weighted avg   0.591788  0.615885  0.577499       768



In [None]:
# ADABOOST #
#seed = 10
num_trees = 25

ada_boost = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('ada_boost',
                         AdaBoostClassifier(n_estimators=num_trees))
                        ])
ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_test)

utils.create_csv(y_pred, y_test, 'AdaB')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Ada Boost")

accuracy of Ada Boost: 0.5403645833333334
                precision    recall  f1-score   support

   not-Kannada   0.600000  0.046154  0.085714        65
 unknown_state   0.706897  0.261146  0.381395       157
      Positive   0.537500  0.919786  0.678501       374
Mixed_feelings   0.511628  0.200000  0.287582       110
      Negative   0.227273  0.080645  0.119048        62

      accuracy                       0.540365       768
     macro avg   0.516659  0.301546  0.310448       768
  weighted avg   0.548669  0.540365  0.466439       768



In [None]:
# XGBOOST #
xg_boost = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('xgboost',
                         XGBClassifier(learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 nthread=4,
 scale_pos_weight=1,
 seed=27))
])
xg_boost.fit(X_train, y_train)
y_pred = xg_boost.predict(X_test)

utils.create_csv(y_pred, y_test, 'XGB')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"XGBoost")

accuracy of XGBoost: 0.5690104166666666
                precision    recall  f1-score   support

   not-Kannada   0.259259  0.107692  0.152174        65
 unknown_state   0.622222  0.535032  0.575342       157
      Positive   0.628009  0.767380  0.690734       374
Mixed_feelings   0.495413  0.490909  0.493151       110
      Negative   0.125000  0.080645  0.098039        62

      accuracy                       0.569010       768
     macro avg   0.425981  0.396332  0.401888       768
  weighted avg   0.536017  0.569010  0.545416       768

