# Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from glob import glob
import re
import json
import sys

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelEncoder
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV

from tensorflow import keras
from keras import Sequential
from keras.layers import Dense, Dropout
from keras.utils import to_categorical

from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
import stemmer as hindi_stemmer

In [2]:
english_stopwords = stopwords.words("english")

with open('final_stopwords.txt', encoding='utf-8') as f:
    hindi_stopwords = f.readlines()
    for i in range(len(hindi_stopwords)):
        hindi_stopwords[i] = re.sub('\n', '', hindi_stopwords[i])

stopword = english_stopwords + hindi_stopwords
english_stemmer = SnowballStemmer("english")

In [3]:
train_directories = []
for i in glob("testdata/train/*/*/"):
    for j in glob(i+'*/'):
        train_directories.append(j)

In [4]:
train_directories

['testdata/train\\German\\Anit-Vaxing\\1480483303253680129\\',
 'testdata/train\\German\\Anti-Gender Language\\1383343760260567043\\',
 'testdata/train\\German\\Antisemitismus\\1366314282665844739\\',
 'testdata/train\\German\\Corona\\1467795022854639621\\',
 'testdata/train\\German\\Corona\\1471440748797169668\\',
 'testdata/train\\German\\corona measures\\1530446777681424385\\',
 'testdata/train\\German\\Demonstrations\\1479045656933244932\\',
 'testdata/train\\German\\Fatshaming\\1470487489982963722\\',
 'testdata/train\\German\\Fatshaming\\1486792911492726785\\',
 'testdata/train\\German\\Female Politician\\1467800206385324038\\',
 'testdata/train\\German\\foreigners and crime\\1527188535593385984\\',
 'testdata/train\\German\\hijab comedy\\1534472301269377024\\',
 'testdata/train\\German\\LGBT\\1526193028309864448\\',
 'testdata/train\\German\\male violence\\1534935096582721541\\',
 'testdata/train\\German\\male violence\\1534939464199454720\\',
 'testdata/train\\German\\Misogyny\

In [5]:
data = []
for i in train_directories:
    try:
        with open(i+'data.json', encoding='utf-8') as f:
            data.append(json.load(f))
    except:
        continue

binary_labels = []
for i in train_directories:
    if('Hinglish' in i):
        with open(i+'binary_labels.json', encoding='utf-8') as f:
            binary_labels.append(json.load(f))
    else:
        try:
            with open(i+'labels.json', encoding='utf-8') as f:
                binary_labels.append(json.load(f))
        except:
            continue

In [6]:
def tr_flatten(d,l):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
        'label':l[d['tweet_id']]
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] +' '+i['tweet'], 
                'label':l[i['tweet_id']]
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] +' '+ i['tweet'] +' '+ j['tweet'], 
                        'label':l[j['tweet_id']]
                    })
    return flat_text

def te_flatten(d):
    flat_text = []
    flat_text.append({
        'tweet_id':d['tweet_id'],
        'text':d['tweet'],
    })

    for i in d['comments']:
            flat_text.append({
                'tweet_id':i['tweet_id'],
                'text':flat_text[0]['text'] + i['tweet'],
            })
            if 'replies' in i.keys():
                for j in i['replies']:
                    flat_text.append({
                        'tweet_id':j['tweet_id'],
                        'text':flat_text[0]['text'] + i['tweet'] + j['tweet'],
                    })
    return flat_text

In [7]:
data_label = []
#for train
for i in range(len(binary_labels)):
    for j in tr_flatten(data[i], binary_labels[i]):
        data_label.append(j)
train_len = len(data_label)

In [8]:
df = pd.DataFrame(data_label, columns = data_label[0].keys(), index = None)

In [9]:
df.head()

Unnamed: 0,tweet_id,text,label
0,1480483303253680129,Ein HNO-#Arzt aus #Weilheim geht juristisch ge...,NONE
1,1480483587866742792,Ein HNO-#Arzt aus #Weilheim geht juristisch ge...,NONE
2,1480483755055845376,Ein HNO-#Arzt aus #Weilheim geht juristisch ge...,NONE
3,1480486718864502787,Ein HNO-#Arzt aus #Weilheim geht juristisch ge...,NONE
4,1480484022132301829,Ein HNO-#Arzt aus #Weilheim geht juristisch ge...,NONE


In [10]:
df.loc[df['label'] == 'NONE'] = 'NOT'
df['label'].value_counts()

HOF    2612
NOT    2609
Name: label, dtype: int64

In [11]:
tweets = df.text
y = df.label

In [12]:
regex_for_english_hindi_emojis="[^a-zA-Z#\U0001F300-\U0001F5FF'|'\U0001F600-\U0001F64F'|'\U0001F680-\U0001F6FF'|'\u2600-\u26FF\u2700-\u27BF\u0900-\u097F]"
def clean_tweet(tweet, english_stemmer, stopword):
    tweet = re.sub(r"@[A-Za-z0-9]+", ' ', tweet)
    tweet = re.sub(r"https?://[A-Za-z0-9./]+", ' ', tweet)
    tweet = re.sub(regex_for_english_hindi_emojis, ' ', tweet)
    tweet = re.sub("RT ", " ", tweet)
    tweet = re.sub("\n", " ", tweet)
    tweet = re.sub(r" +", " ", tweet)
    tokens = []
    for token in tweet.split():
        if token not in stopword:
            token = english_stemmer.stem(token)
            token = hindi_stemmer.hi_stem(token)
            tokens.append(token)
    return " ".join(tokens)

In [13]:
cleaned_tweets = [clean_tweet(
        tweet, english_stemmer, stopword) for tweet in tweets]

In [14]:
vectorizer = TfidfVectorizer(min_df = 5)
X = vectorizer.fit_transform(cleaned_tweets)
X = X.todense()

In [15]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=50)

In [16]:
classifier = LogisticRegression()
classifier.fit(X_train, y_train)



In [17]:
y_pred = classifier.predict(X_val)



In [18]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         HOF       0.71      0.81      0.75       240
         NOT       0.82      0.72      0.76       283

    accuracy                           0.76       523
   macro avg       0.76      0.76      0.76       523
weighted avg       0.77      0.76      0.76       523



In [19]:
le = LabelEncoder()
y_train = le.fit_transform(y_train)
y_val = le.transform(y_val)

In [20]:
model = Sequential(
    [
        Dense(64, activation="relu"),
        Dense(32, activation="relu"),
        Dense(1, activation="sigmoid"),
    ]
)
model.compile('adam', loss='binary_crossentropy', metrics = ['accuracy'])

In [21]:
model.fit(X_train, y_train, epochs = 5, batch_size = 32)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2521b80bd90>

In [22]:
y_pred = model.predict(X_val)
y_pred = (y_pred > 0.4).astype('int64')
y_pred = y_pred.reshape(len(y_pred))    



In [23]:
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.80      0.76       240
           1       0.81      0.75      0.78       283

    accuracy                           0.77       523
   macro avg       0.77      0.77      0.77       523
weighted avg       0.77      0.77      0.77       523



In [24]:
test_directories = []
for i in glob("testdata/test/*/*/"):
    for j in glob(i+'*/'):
        test_directories.append(j)

In [25]:
test_directories

['testdata/test\\German\\Corona\\1530498233398607873\\',
 'testdata/test\\German\\green party\\1534603102179016708\\',
 'testdata/test\\German\\Presiden\\1486034666968731655\\',
 'testdata/test\\German\\Rassismus\\1367179784372047876\\',
 'testdata/test\\Hinglish\\celebrity_controversies\\1425321569350414343\\',
 'testdata/test\\Hinglish\\celebrity_controversies\\1438882238087659525\\',
 'testdata/test\\Hinglish\\farmer_protest\\1480518248076509184\\',
 'testdata/test\\Hinglish\\hinduphobia\\1445930336039358469\\',
 'testdata/test\\Hinglish\\hinduphobia\\1467895004223791105\\',
 'testdata/test\\Hinglish\\hinduphobia\\1470652707824291843\\',
 'testdata/test\\Hinglish\\historical_hindu_muslim\\1445435933214617602\\',
 'testdata/test\\Hinglish\\islamophobia\\1442176510224261120\\',
 'testdata/test\\Hinglish\\islamophobia\\1533444368690032641\\',
 'testdata/test\\Hinglish\\ozil\\1438762071835951104\\',
 'testdata/test\\Hinglish\\russia_ukarain_conflict\\1497413697056215043\\',
 'testdata/t

In [26]:
test_data = []
for i in test_directories:
    with open(i+'data.json', encoding='utf-8') as f:
        data.append(json.load(f))

In [27]:
test_tweetid_data = []
#for test
for i in range(len(binary_labels), len(data)):
    for j in te_flatten(data[i]):
        test_tweetid_data.append(j)

In [28]:
test_df = pd.DataFrame(test_tweetid_data, columns = test_tweetid_data[0].keys(), index = None)

In [29]:
test_df.head()

Unnamed: 0,tweet_id,text
0,1530498233398607873,Die Protagonisten der letzten 2 Jahre fordern ...
1,1530498806164365318,Die Protagonisten der letzten 2 Jahre fordern ...
2,1530499829901807621,Die Protagonisten der letzten 2 Jahre fordern ...
3,1530507651326611457,Die Protagonisten der letzten 2 Jahre fordern ...
4,1530531871037263873,Die Protagonisten der letzten 2 Jahre fordern ...


In [30]:
test_tweets = test_df.text
tweet_ids = test_df.tweet_id

In [31]:
cleaned_test = [clean_tweet(
        tweet, english_stemmer, stopword) for tweet in test_tweets]

In [32]:
X_test = vectorizer.transform(cleaned_test)
X_test = X_test.todense()

In [33]:
submission_prediction = classifier.predict(X_test)
submission = {'tweet_id': tweet_ids, 'label':submission_prediction}
submission = pd.DataFrame(submission)



In [34]:
from msilib import schema


submission.to_json('testdata/submission/submission.json',orient='table',index=False)