In [2]:
import pandas as pd
import re
from tqdm.notebook import tqdm
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neural_network import MLPClassifier

In [3]:
def write_result(file_name, preds):
    with open('../results/'+file_name, 'wt') as file:
        for i in preds:
            file.write(i+'\n')
    with open('../results/last_result.txt', 'wt') as file:
        for i in preds:
            file.write(i+'\n')
    !python ../NADI-2020_release_1.0/NADI_release/NADI-DID-Scorer.py ../tsv/gold1.txt ../results/last_result.txt

def preprocess_text(train_list, test_list):
    """Remove Latin letters, digits and URL/handle punctuation from tweets.

    For every tweet, the maximal runs of characters that are NOT in the set
    ``A-Z a-z 0-9 : / _ . # @ , = + ( )`` are extracted and joined with a
    single space. The characters U+00A0, U+200C, U+FE329 (supplementary
    plane), U+2066 and U+2069 are then deleted and surrounding whitespace
    is stripped.

    Args:
        train_list: Iterable of training tweet strings.
        test_list: Iterable of dev/test tweet strings.

    Returns:
        A tuple ``(train_cleaned, test_cleaned)`` of two lists of strings, in
        the same order as the inputs.

    Raises:
        TypeError: If an item is not a string (raised by the regex search).
    """
    # Raw string: the previous non-raw literal contained the invalid escape
    # sequences "\(" and "\)", which Python flags with a warning. The pattern
    # value itself is unchanged. Compiled once instead of once per tweet.
    kept_runs = re.compile(r'[^A-Za-z:/_.0-9\#@,=+\(\)]+')
    # Deleting these single characters in one pass is equivalent to the
    # previous chain of .replace(ch, '') calls.
    dropped_chars = str.maketrans('', '', '\xa0\u200c\U000fe329\u2066\u2069')

    def _clean(tweets):
        # Shared by both splits so the two can never drift apart.
        cleaned = []
        for tweet in tqdm(tweets):
            joined = " ".join(kept_runs.findall(tweet))
            cleaned.append(joined.translate(dropped_chars).strip())
        return cleaned

    # Train is processed before dev, matching the original progress-bar order.
    X_train_corrected_tweets = _clean(train_list)
    X_dev_corrected_tweets = _clean(test_list)
    return X_train_corrected_tweets, X_dev_corrected_tweets

# Task 1: country-level classification

# Unbalanced training data (original train_labeled.tsv)

In [4]:
# Read the labelled NADI train and dev splits (tab-separated files).
train_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/train_labeled.tsv', sep='\t')
dev_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv', sep='\t')

# Inputs are the tweet texts; the target here is the country label column.
X_train_original = train_df["#2 tweet_content"]
y_train_original = train_df["#3 country_label"]
X_dev_original = dev_df["#2 tweet_content"]
y_dev_original = dev_df["#3 country_label"]

# Clean both splits, then fit TF-IDF on train only and apply the same
# fitted vectorizer to dev.
X_train_corrected, X_dev_corrected = preprocess_text(X_train_original, X_dev_original)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_corrected)
X_dev = vectorizer.transform(X_dev_corrected)

HBox(children=(FloatProgress(value=0.0, max=21000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4957.0), HTML(value='')))




In [4]:
# MLP with one hidden layer of 1024 units; max_iter=50 bounds the training iterations.
model = MLPClassifier(hidden_layer_sizes=(1024,), max_iter=50, verbose=True)
model.fit(X_train, y_train_original)

# Predict the dev set, show the labels, then save and score them.
prediction = model.predict(X_dev)
print(prediction)
write_result('tfidf_task1_unbal.txt', prediction)

Iteration 1, loss = 2.53495277
Iteration 2, loss = 1.23892702
Iteration 3, loss = 0.27399245
Iteration 4, loss = 0.10015557
Iteration 5, loss = 0.06798964
Iteration 6, loss = 0.05801829
Iteration 7, loss = 0.05557359
Iteration 8, loss = 0.05018162
Iteration 9, loss = 0.04742837
Iteration 10, loss = 0.04619383
Iteration 11, loss = 0.04353102
Iteration 12, loss = 0.04288444
Iteration 13, loss = 0.04118913
Iteration 14, loss = 0.04012071
Iteration 15, loss = 0.03875515
Iteration 16, loss = 0.03861753
Iteration 17, loss = 0.03659112
Iteration 18, loss = 0.03613538
Iteration 19, loss = 0.03414700
Iteration 20, loss = 0.03421354
Iteration 21, loss = 0.03334296
Iteration 22, loss = 0.03361130
Iteration 23, loss = 0.03416203
Iteration 24, loss = 0.03310318
Iteration 25, loss = 0.03235669
['Iraq' 'Egypt' 'Algeria' ... 'Egypt' 'Egypt' 'Sudan']





OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 14.75 %
MACRO AVERAGE RECALL SCORE: 14.13 %
MACRO AVERAGE F1 SCORE: 13.98 %
OVERALL ACCURACY: 28.42 %



# Balanced training data (oversample_train.tsv)

In [5]:
# Training data comes from the oversampled TSV (presumably a class-balanced
# resampling of the original train split - confirm against how it was built);
# the dev split is the original labelled one.
train_df = pd.read_csv('../tsv/oversample_train.tsv', sep='\t')
dev_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv', sep='\t')

# Inputs are the tweet texts; the target here is the country label column.
X_train_original = train_df["#2 tweet_content"]
y_train_original = train_df["#3 country_label"]
X_dev_original = dev_df["#2 tweet_content"]
y_dev_original = dev_df["#3 country_label"]

# Clean both splits, then fit TF-IDF on train only and apply the same
# fitted vectorizer to dev.
X_train_corrected, X_dev_corrected = preprocess_text(X_train_original, X_dev_original)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_corrected)
X_dev = vectorizer.transform(X_dev_corrected)

HBox(children=(FloatProgress(value=0.0, max=93933.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4957.0), HTML(value='')))




In [6]:
# MLP with one hidden layer of 1024 units; max_iter=25 bounds the training iterations.
model = MLPClassifier(hidden_layer_sizes=(1024,), max_iter=25, verbose=True)
model.fit(X_train, y_train_original)

# Predict the dev set, show the labels, then save and score them.
prediction = model.predict(X_dev)
print(prediction)
write_result('tfidf_task1_bal.txt', prediction)



['Kuwait' 'Kuwait' 'Somalia' ... 'Egypt' 'Bahrain' 'Djibouti']
  _warn_prf(average, modifier, msg_start, len(result))

OVERALL SCORES:
MACRO AVERAGE PRECISION SCORE: 11.46 %
MACRO AVERAGE RECALL SCORE: 8.39 %
MACRO AVERAGE F1 SCORE: 5.54 %
OVERALL ACCURACY: 6.76 %



# Task 2: province-level classification

# Unbalanced training data (original train_labeled.tsv)

In [7]:
# Read the labelled NADI train and dev splits (tab-separated files).
train_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/train_labeled.tsv', sep='\t')
dev_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv', sep='\t')

# Inputs are the tweet texts; the target here is the province label column.
X_train_original = train_df["#2 tweet_content"]
y_train_original = train_df["#4 province_label"]
X_dev_original = dev_df["#2 tweet_content"]
y_dev_original = dev_df["#4 province_label"]

# Clean both splits, then fit TF-IDF on train only and apply the same
# fitted vectorizer to dev.
X_train_corrected, X_dev_corrected = preprocess_text(X_train_original, X_dev_original)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_corrected)
X_dev = vectorizer.transform(X_dev_corrected)

HBox(children=(FloatProgress(value=0.0, max=21000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4957.0), HTML(value='')))




In [8]:
# MLP with one hidden layer of 1024 units; max_iter=25 bounds the training iterations.
model = MLPClassifier(hidden_layer_sizes=(1024,), max_iter=25, verbose=True)
model.fit(X_train, y_train_original)

# Predict the dev set, show the labels, then save and score them.
prediction = model.predict(X_dev)
print(prediction)
write_result('tfidf_task2_unbal.txt', prediction)

ERROR:root:Internal Python error in the inspect module.
Below is the traceback from this internal error.



Traceback (most recent call last):
  File "/home/nikamanth/anaconda3/envs/torch/lib/python3.7/site-packages/IPython/core/interactiveshell.py", line 3331, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-e3a5f3c8898c>", line 3, in <module>
    prediction = model.predict(X_dev)
  File "/home/nikamanth/anaconda3/envs/torch/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 971, in predict
    y_pred = self._predict(X)
  File "/home/nikamanth/anaconda3/envs/torch/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 685, in _predict
    self._forward_pass(activations)
  File "/home/nikamanth/anaconda3/envs/torch/lib/python3.7/site-packages/sklearn/neural_network/_multilayer_perceptron.py", line 104, in _forward_pass
    self.coefs_[i])
  File "/home/nikamanth/anaconda3/envs/torch/lib/python3.7/site-packages/sklearn/utils/extmath.py", line 151, in safe_sparse_dot
    ret = a @ b
Keyboard

KeyboardInterrupt: 

# Balanced training data (oversample_train.tsv)

In [None]:
# Training data comes from the oversampled TSV (presumably a class-balanced
# resampling of the original train split - confirm against how it was built);
# the dev split is the original labelled one.
train_df = pd.read_csv('../tsv/oversample_train.tsv', sep='\t')
dev_df = pd.read_csv('../NADI-2020_release_1.0/NADI_release/dev_labeled.tsv', sep='\t')

# Inputs are the tweet texts; the target here is the province label column.
X_train_original = train_df["#2 tweet_content"]
y_train_original = train_df["#4 province_label"]
X_dev_original = dev_df["#2 tweet_content"]
y_dev_original = dev_df["#4 province_label"]

# Clean both splits, then fit TF-IDF on train only and apply the same
# fitted vectorizer to dev.
X_train_corrected, X_dev_corrected = preprocess_text(X_train_original, X_dev_original)
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train_corrected)
X_dev = vectorizer.transform(X_dev_corrected)

In [None]:
# MLP with one hidden layer of 1024 units; max_iter=25 bounds the training iterations.
model = MLPClassifier((1024,),verbose=True,max_iter=25)
model.fit(X_train, y_train_original)

# Predict the dev set, show the labels, then save and score them.
prediction = model.predict(X_dev)
print(prediction)
# File name fixed from 'tfidf.txt_task2_bal' (extension in the wrong place) so
# it follows the same 'tfidf_<task>_<variant>.txt' pattern as the other runs.
write_result('tfidf_task2_bal.txt',prediction)