In [None]:
import pandas as pd
import re
import nltk
import string
from nltk.corpus import stopwords
import spacy

from sklearn.metrics import accuracy_score,classification_report, f1_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [None]:
nlp = spacy.load('en', disable=['parser', 'ner'])

In [None]:
# DATA CLEANING AND PREPARATION #
class Utils(object):

    def cleanText(self, text):
        review = re.sub(r"^https://t.co/[a-zA-Z0-9]*\s", " ", str(text))
        review = re.sub(r"\([\s\S]*\)", " ", str(review))
        review = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*\s", " ", str(review))
        review = re.sub(r"\s+https://t.co/[a-zA-Z0-9]*$", " ", str(review))
        review = review.lower()
        review = re.sub(r"that's", "that is", str(review))
        review = re.sub(r"there's", "there is", str(review))
        review = re.sub(r"what's", "what is", str(review))
        review = re.sub(r"where's", "where is", str(review))
        review = re.sub(r"it's", "it is", str(review))
        review = re.sub(r"who's", "who is", str(review))
        review = re.sub(r"i'm", "i am", str(review))
        review = re.sub(r"she's", "she is", str(review))
        review = re.sub(r"he's", "he is", str(review))
        review = re.sub(r"they're", "they are", str(review))
        review = re.sub(r"who're", "who are", str(review))
        review = re.sub(r"ain't", "am not", str(review))
        review = re.sub(r"wouldn't", "would not", str(review))
        review = re.sub(r"shouldn't", "should not", str(review))
        review = re.sub(r"can't", "can not", str(review))
        review = re.sub(r"couldn't", "could not", str(review))
        review = re.sub(r"won't", "will not", str(review))
        review = re.sub(r" pm ", " ", str(review))
        review = re.sub(r" am ", " ", str(review))
        review = re.sub(r'[^\[\]]+(?=\])', " ", str(review))
        review = re.sub(r"\W", " ", str(review))
        review = re.sub(r"\d", " ", str(review))
        review = re.sub(r"\s+[a-z]\s+", " ", str(review))
        review = re.sub(r"\s+[a-z]$", " ", str(review))
        review = re.sub(r"^[a-z]\s+", " ", str(review))
        review = re.sub(r"\s+", " ", str(review))
        return review

    def remove_punc(self, text):
        table = str.maketrans("", "", string.punctuation)
        return text.translate(table)

    def remove_emoticon(self, text):
        emoji_pattern = re.compile("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags 
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", flags=re.UNICODE)
        return emoji_pattern.sub(r'', text)


    
    def lemmatization(self, text):
        doc = nlp(text)
        return " ".join([token.lemma_ for token in doc])

    nltk.download('stopwords')
    def remove_stops(self, text):
        stop_words = set(nltk.corpus.stopwords.words('english'))
        text = [word.lower() for word in text.split() if word.lower() not in stop_words]
        return " ".join(text)


    def readData1(self, path, inputColumnIndex=0, outputColumnIndex=1):
        df = pd.read_csv(path, error_bad_lines=False, sep='\t')
        X = df.iloc[:, inputColumnIndex].values
        y = df.iloc[:, outputColumnIndex].values
        return X,y
    
    def readData2(self, path, inputColumnIndex=1, outputColumnIndex=2):
        df = pd.read_csv(path, error_bad_lines=False, sep='\t')
        X = df.iloc[:, inputColumnIndex].values
        y = df.iloc[:, outputColumnIndex].values
        return X,y
    
    def draw_prediction_results(self, y_pred, y_test, my_tags, method):
        print('accuracy of ' + method + ': %s' % accuracy_score(y_pred, y_test))
        print(classification_report(y_test, y_pred, target_names=my_tags, digits = 6))
    
    def create_csv(self, y_pred, y_test, classifier):
        report = classification_report(y_test, y_pred)
        lines = report.split('\n')
        line = lines[-2].split()
        line.remove('weighted')
        line[0] = classifier
        df = pd.DataFrame(line).transpose()
        csv_data = df.to_csv(index=False)
        df.to_csv('Tamil.csv',  mode='a', header=False, index=False)
        
    
    def crossValidation(self, prediction, input, output, k=5):
        scores = cross_val_score(prediction, input,output, cv=k)
        print("Accuracy of Cross Validation Mean: %0.6f (+/- %0.6f)" % (scores.mean(), scores.std() * 2))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# FUNCTION CALL FOR DATA CLEANING AND PREPARATION #
utils = Utils()
X1,y_train=utils.readData1('tamil_sentiment_full_train.tsv')

ourTags =['not-Tamil', 'unknown_state', 'Positive', 'Mixed_feelings', 'Negative']
X_train=[]

for i in range(0, len(X1)):
    t = utils.cleanText(X1[i])
    t = utils.remove_emoticon(t)
    t = utils.remove_punc(t)
    t = utils.remove_stops(t)
    t = utils.lemmatization(t)
    X_train.append(t)

print(X_train[:10])
#X_train, X_test, y_train, y_test = train_test_split(corpus, y, test_size=0.3, random_state=0)

['vani bhojam fan hit like solli like vangida vendiyathu', 'love ajith like', 'ennaya trailer ku mudi ellam nikkudhu vera level trailer', 'vijay annaa ur maassssss therrrrriiiiii', 'நம ப நட ந ச ம த ன ப ச ச', 'gommala end vera level da deii getrajinifie', 'vjs anna kaaga like potavanga like pannuga', 'theri semma theri joseph kuruvilla vijay kumar awesome kumar', 'ithu yethu maathiri illama puthu maathiyaala irukku', 'wow back baasha mode thalaivaaaa petta paraakkkkk']


In [None]:
# FUNCTION CALL FOR DATA CLEANING AND PREPARATION #
utils = Utils()
X2,y_test=utils.readData2('tamil_sentiment_full_test_withtlabels.tsv')

X_test=[]

for i in range(0, len(X2)):
    t = utils.cleanText(X2[i])
    t = utils.remove_emoticon(t)
    t = utils.remove_punc(t)
    t = utils.remove_stops(t)
    t = utils.lemmatization(t)
    X_test.append(t)
#print(corpus)

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC, SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [None]:
line = ['', 'precision', 'recall', 'f1-score',  'support']
df = pd.DataFrame(line).transpose()
csv_data = df.to_csv(index=False)
df.to_csv('Tamil.csv',  mode='a', header=False, index=False)

In [None]:
# LOGISTIC REGRESSION #
lrp = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2,analyzer='word', ngram_range=(1, 3))),
                ('tfidf', TfidfTransformer()),
                ('lr', LogisticRegression(max_iter=1000))
                ])
lrp.fit(X_train, y_train)
y_pred = lrp.predict(X_test)

# prediction results
#utils.draw_prediction_results(y_pred,y_test,ourTags,"Logistic Regression")

utils.create_csv(y_pred, y_test, 'LR')

#prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Logistic Regression")

accuracy of Logistic Regression: 0.6442526124488869
                precision    recall  f1-score   support

     not-Tamil   0.358108  0.112766  0.171521       470
 unknown_state   0.477663  0.291405  0.361979       477
      Positive   0.683634  0.907306  0.779747      2546
Mixed_feelings   0.757812  0.397541  0.521505       244
      Negative   0.519737  0.356391  0.422837       665

      accuracy                       0.644253      4402
     macro avg   0.559391  0.413082  0.451518      4402
  weighted avg   0.605911  0.644253  0.601306      4402



In [None]:
# MULTINOMIAL NAIVE BAYES #
multinomial_naive_bayes = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('multinomial_naive_bayes',
                         MultinomialNB())
                        ])
multinomial_naive_bayes.fit(X_train, y_train)
y_pred = multinomial_naive_bayes.predict(X_test)

utils.create_csv(y_pred, y_test, 'MNB')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Multinomial Naive Bayes")

accuracy of Multinomial Naive Bayes: 0.6256247160381645
                precision    recall  f1-score   support

     not-Tamil   0.437500  0.029787  0.055777       470
 unknown_state   0.500000  0.132075  0.208955       477
      Positive   0.627706  0.968185  0.761625      2546
Mixed_feelings   0.839286  0.385246  0.528090       244
      Negative   0.575610  0.177444  0.271264       665

      accuracy                       0.625625      4402
     macro avg   0.596020  0.338548  0.365142      4402
  weighted avg   0.597417  0.625625  0.539352      4402



In [None]:
# LINEAR SVM #
linear_svm = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('linear_svc',
                        SVC(kernel='linear'))
                        ])
linear_svm.fit(X_train, y_train)
y_pred = linear_svm.predict(X_test)

utils.create_csv(y_pred, y_test, 'L-SVM')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Linear SVM")

accuracy of Linear SVM: 0.6367560199909132
                precision    recall  f1-score   support

     not-Tamil   0.389706  0.112766  0.174917       470
 unknown_state   0.460317  0.303983  0.366162       477
      Positive   0.678603  0.893166  0.771240      2546
Mixed_feelings   0.743243  0.450820  0.561224       244
      Negative   0.488938  0.332331  0.395703       665

      accuracy                       0.636756      4402
     macro avg   0.552162  0.418613  0.453849      4402
  weighted avg   0.599035  0.636756  0.595304      4402



In [None]:
# RBF SVM #
rbf_svm = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('rbf_svc',
                        SVC(kernel='rbf', gamma=1))
                        ])
rbf_svm.fit(X_train, y_train)
y_pred = rbf_svm.predict(X_test)

utils.create_csv(y_pred, y_test, 'R-SVM')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"RBF SVM")

accuracy of RBF SVM: 0.6369831894593366
                precision    recall  f1-score   support

     not-Tamil   0.500000  0.061702  0.109848       470
 unknown_state   0.481651  0.220126  0.302158       477
      Positive   0.650315  0.932050  0.766102      2546
Mixed_feelings   0.746667  0.459016  0.568528       244
      Negative   0.565749  0.278195  0.372984       665

      accuracy                       0.636983      4402
     macro avg   0.588876  0.390218  0.423924      4402
  weighted avg   0.608555  0.636983  0.575422      4402



In [None]:
# POLY SVM #
poly_svm = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('poly_svc',
                        SVC(kernel='poly',degree = 1))
                        ])
poly_svm.fit(X_train, y_train)
y_pred = poly_svm.predict(X_test)

utils.create_csv(y_pred, y_test, 'P-SVM')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"POLY SVM")

accuracy of POLY SVM: 0.6365288505224898
                precision    recall  f1-score   support

     not-Tamil   0.384058  0.112766  0.174342       470
 unknown_state   0.460317  0.303983  0.366162       477
      Positive   0.678710  0.892773  0.771162      2546
Mixed_feelings   0.743243  0.450820  0.561224       244
      Negative   0.488938  0.332331  0.395703       665

      accuracy                       0.636529      4402
     macro avg   0.551053  0.418535  0.453719      4402
  weighted avg   0.598494  0.636529  0.595198      4402



In [None]:
# RANDOM FOREST #
random_forest = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('random_forest',
                         RandomForestClassifier())
                        ])
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

utils.create_csv(y_pred, y_test, 'RF')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Random Forest")

accuracy of Random Forest: 0.6342571558382554
                precision    recall  f1-score   support

     not-Tamil   0.372549  0.040426  0.072937       470
 unknown_state   0.469880  0.245283  0.322314       477
      Positive   0.654561  0.927337  0.767431      2546
Mixed_feelings   0.642105  0.500000  0.562212       244
      Negative   0.567213  0.260150  0.356701       665

      accuracy                       0.634257      4402
     macro avg   0.541261  0.394639  0.416319      4402
  weighted avg   0.590553  0.634257  0.571624      4402



In [None]:
# KNeighborsClassifier #
knn = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('kNN', KNeighborsClassifier(n_neighbors=3))
                        ])
knn.fit(X_train, y_train)
y_pred = knn.predict(X_test)

utils.create_csv(y_pred, y_test, 'kNN')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"KNeighborsClassifier")

accuracy of KNeighborsClassifier: 0.5615629259427533
                precision    recall  f1-score   support

     not-Tamil   0.114679  0.053191  0.072674       470
 unknown_state   0.219731  0.102725  0.140000       477
      Positive   0.613517  0.873527  0.720791      2546
Mixed_feelings   0.400000  0.204918  0.271003       244
      Negative   0.587678  0.186466  0.283105       665

      accuracy                       0.561563      4402
     macro avg   0.387121  0.284166  0.297515      4402
  weighted avg   0.501847  0.561563  0.497606      4402



In [None]:
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import AdaBoostClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier

In [None]:
# EXTRA TREE CLASSIFIER #
extra_tree = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('extra tree',
                         ExtraTreesClassifier())
                        ])
extra_tree.fit(X_train, y_train)
y_pred = extra_tree.predict(X_test)

utils.create_csv(y_pred, y_test, 'XTree')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Extra Tree Classifier")

accuracy of Extra Tree Classifier: 0.6310767832803271
                precision    recall  f1-score   support

     not-Tamil   0.305556  0.046809  0.081181       470
 unknown_state   0.452899  0.262055  0.332005       477
      Positive   0.662944  0.909269  0.766810      2546
Mixed_feelings   0.613861  0.508197  0.556054       244
      Negative   0.533333  0.288722  0.374634       665

      accuracy                       0.631077      4402
     macro avg   0.513719  0.403010  0.422137      4402
  weighted avg   0.579725  0.631077  0.575563      4402



In [None]:
# VOTING CLASSIFIER #
# ALL CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp), ("Multinomial Naive Bayes", multinomial_naive_bayes), ("Extra Tree", extra_tree), ("Random Forest", random_forest), ("Poly SVM", poly_svm), ("RBF SVM", rbf_svm), ("KNeighborsClassifier", knn)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEns')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.6367560199909132
                precision    recall  f1-score   support

     not-Tamil   0.465753  0.072340  0.125230       470
 unknown_state   0.487603  0.247379  0.328234       477
      Positive   0.650082  0.934014  0.766602      2546
Mixed_feelings   0.795455  0.430328  0.558511       244
      Negative   0.565657  0.252632  0.349272       665

      accuracy                       0.636756      4402
     macro avg   0.592910  0.387339  0.425570      4402
  weighted avg   0.608099  0.636756  0.576042      4402



In [None]:
# VOTING CLASSIFIER #
# TOP 5 CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp),  ("Random Forest", random_forest), ("Poly SVM", poly_svm), ("RBF SVM", rbf_svm)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEns5')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.641753748296229
                precision    recall  f1-score   support

     not-Tamil   0.429825  0.104255  0.167808       470
 unknown_state   0.483986  0.285115  0.358839       477
      Positive   0.670809  0.911626  0.772894      2546
Mixed_feelings   0.773723  0.434426  0.556430       244
      Negative   0.519512  0.320301  0.396279       665

      accuracy                       0.641754      4402
     macro avg   0.575571  0.411145  0.450450      4402
  weighted avg   0.607684  0.641754  0.594529      4402



In [None]:
# VOTING CLASSIFIER #
# TOP 3 CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp), ("Poly SVM", poly_svm)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEns3')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.6367560199909132
                precision    recall  f1-score   support

     not-Tamil   0.386861  0.112766  0.174629       470
 unknown_state   0.460317  0.303983  0.366162       477
      Positive   0.678806  0.893166  0.771370      2546
Mixed_feelings   0.743243  0.450820  0.561224       244
      Negative   0.488938  0.332331  0.395703       665

      accuracy                       0.636756      4402
     macro avg   0.551633  0.418613  0.453818      4402
  weighted avg   0.598849  0.636756  0.595349      4402



In [None]:
# VOTING CLASSIFIER #
# BEST OF ALL CLASSIFIERS #

#create a dictionary of our models
estimators=[("Linear SVM", linear_svm), ("Logistic Regression", lrp), ("Multinomial Naive Bayes", multinomial_naive_bayes), ("Extra Tree", extra_tree)]

hard_ensemble = VotingClassifier(estimators, voting="hard")
hard_ensemble.fit(X_train, y_train)
y_pred = hard_ensemble.predict(X_test)

utils.create_csv(y_pred, y_test, 'HEnsA')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Hard Ensemble")

accuracy of Hard Ensemble: 0.6406179009541118
                precision    recall  f1-score   support

     not-Tamil   0.428571  0.102128  0.164948       470
 unknown_state   0.491289  0.295597  0.369110       477
      Positive   0.658863  0.928515  0.770786      2546
Mixed_feelings   0.790323  0.401639  0.532609       244
      Negative   0.580756  0.254135  0.353556       665

      accuracy                       0.640618      4402
     macro avg   0.589960  0.396403  0.438202      4402
  weighted avg   0.611604  0.640618  0.586343      4402



In [None]:
# ADABOOST #
#seed = 10
num_trees = 25

ada_boost = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('ada_boost',
                         AdaBoostClassifier(n_estimators=num_trees))
                        ])
ada_boost.fit(X_train, y_train)
y_pred = ada_boost.predict(X_test)

utils.create_csv(y_pred, y_test, 'AdaB')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"Ada Boost")

accuracy of Ada Boost: 0.5860972285324852
                precision    recall  f1-score   support

     not-Tamil   0.000000  0.000000  0.000000       470
 unknown_state   0.000000  0.000000  0.000000       477
      Positive   0.608178  0.923016  0.733229      2546
Mixed_feelings   0.654206  0.286885  0.398860       244
      Negative   0.372093  0.240602  0.292237       665

      accuracy                       0.586097      4402
     macro avg   0.326895  0.290101  0.284865      4402
  weighted avg   0.444227  0.586097  0.490337      4402



  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# XGBOOST #
xg_boost = Pipeline([('vect', CountVectorizer(min_df=3, max_df=0.2, analyzer='word', ngram_range=(1, 3),)),
                        ('tfidf', TfidfTransformer()),
                        ('xgboost',
                         XGBClassifier(learning_rate =0.1,
 n_estimators=1000,
 max_depth=5,
 min_child_weight=1,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 nthread=4,
 scale_pos_weight=1,
 seed=27))
])
xg_boost.fit(X_train, y_train)
y_pred = xg_boost.predict(X_test)

utils.create_csv(y_pred, y_test, 'XGB')

# prediction results
utils.draw_prediction_results(y_pred,y_test,ourTags,"XGBoost")

accuracy of XGBoost: 0.6285779191276692
                precision    recall  f1-score   support

     not-Tamil   0.324138  0.100000  0.152846       470
 unknown_state   0.460145  0.266247  0.337317       477
      Positive   0.673015  0.902200  0.770935      2546
Mixed_feelings   0.658065  0.418033  0.511278       244
      Negative   0.469734  0.291729  0.359926       665

      accuracy                       0.628578      4402
     macro avg   0.517019  0.395642  0.426460      4402
  weighted avg   0.581161  0.628578  0.581472      4402

