In [None]:
import numpy as np
import pandas as pd

"Irrelevant" class - raw data & pre-processing

In [None]:
#Load the two tweet exports that together form the "Irrelevant" class.
#Both files are read with the Windows-1254 (Turkish) code page.

ibb_anlamsiz, ibb_siyasi = (
    pd.read_csv(csv_path, encoding = 'cp1254')
    for csv_path in ('/data/ibb_anlamsiz.csv', '/data/ibb_siyasi.csv')
)

In [None]:
!pip install TurkishStemmer

Collecting TurkishStemmer
  Downloading https://files.pythonhosted.org/packages/fd/bf/3e56dd4ce442f9237e1c202ce736ae5e5818d74f81604f1665e67736cfc0/TurkishStemmer-1.3-py3-none-any.whl
Installing collected packages: TurkishStemmer
Successfully installed TurkishStemmer-1.3


In [None]:
import nltk
from TurkishStemmer import TurkishStemmer
import re
from sklearn.feature_extraction.text import CountVectorizer

nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [None]:
"""This is the preprocessing for tweets. We have additional processes such as removal of incomplete words at the end of tweets, usernames 
and RT (denoting retweets) to remove extra evidence about irrelevant tweets that are not related to the content."""

#Turkish stop-word list (from NLTK) and stemmer, both read as globals by clean_text below.
stop_words = nltk.corpus.stopwords.words('turkish')

porter = TurkishStemmer()

def clean_text(string):
    """Normalise one raw tweet into a space-separated string of stemmed tokens.

    Steps, in order: drop words cut off by an ellipsis, replace e-mail
    addresses with 'emailaddr', strip @usernames and a leading 'RT' marker,
    blank out URLs, replace currency symbols / phone numbers / numbers with
    placeholder tokens, remove punctuation, collapse whitespace, lowercase,
    then drop stop words and stem the remaining terms.

    Parameters
    ----------
    string : the tweet text; coerced with str() in the same way clean_text2 does.

    Returns
    -------
    str : the cleaned terms joined by single spaces ('' if nothing survives).

    Reads the module-level `porter` (stemmer) and `stop_words` (list).
    """
    message = str(string)

    message = re.sub(r'\w+\…'," ", message) #Removal of incomplete words
    #E-mail addresses are handled BEFORE usernames: the username pattern removes every
    #'@word', after which the e-mail pattern could never match anything.
    message = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', message)
    message = re.sub(r'@\w+(:)?'," ", message) #Removal of Twitter user names
    #\b keeps words that merely start with "RT" (e.g. "RTÜK") intact
    message = re.sub(r'\ART\b', " ", message) #Removal of RT
    message = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', #Replace URLs with space because it might be too freq in this class
                     message)
    message = re.sub(r'₺|\$', 'money', message) #Replace money symbols with 'money'
    message = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', #Replace phone numbers with 'phonenumbr'
        'phonenumbr', message)
    message = re.sub(r'\d+(\.\d+)?', 'numbr', message)  #Replace numbers with 'numbr'
    message = re.sub(r'[^\w\d\s]', ' ', message) #Punctuation removed
    message = re.sub(r'\s+', ' ', message) #Too much space replaced by single space
    message = re.sub(r'^\s+|\s+?$', '', message.lower()) #Get rid of spaces at the beginning and at the end.

    #Build the lookup set once per call instead of once per term
    stop_set = set(stop_words)
    return ' '.join(
        porter.stem(term)
        for term in message.split()
        if term not in stop_set
    )

In [None]:
"""Creating the dataset for tweets labelled as 'Irrelevant'. We apply pre-processing, arrange the columns
and add the label "4" to denote 'Irrelevant'."""

#Stack both sources, renumber the rows from 0 and keep only the tweet text.
ibb_ilgisiz = (
    pd.concat([ibb_anlamsiz, ibb_siyasi], axis = 0)
    .reset_index(drop = True)
    .drop(['Author', 'Date'], axis = 1)
)

#Clean every tweet, then tag the whole frame with class 4 ('Irrelevant').
ibb_ilgisiz['Text'] = ibb_ilgisiz['Text'].apply(clean_text)
ibb_ilgisiz['Label'] = 4
ibb_ilgisiz.shape

(2713, 2)

In [None]:
#Two-stage split of the 'Irrelevant' data: 20% is held out as test,
#then 20% of the remainder is held out as validation.

from sklearn.model_selection import train_test_split

split_options = dict(test_size = 0.2, random_state = 42)
irrel_features = ibb_ilgisiz.drop('Label', axis = 1)
irrel_labels = ibb_ilgisiz['Label']

tr_irrel_text, tst_irrel_text, tr_irrel_lbl, tst_irrel_lbl = train_test_split(
    irrel_features, irrel_labels, **split_options)

tr_irrel_text, val_irrel_text, tr_irrel_lbl, val_irrel_lbl = train_test_split(
    tr_irrel_text, tr_irrel_lbl, **split_options)

Municipality's enterprises data - raw data & pre-processing

In [None]:
#Load the complaints addressed to the municipality's enterprises.
#These files arrive already divided into train / validation / test.

tr_organisations, val_organisations, tst_organisations = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/data/ibb_organisations_train.csv',
        '/data/ibb_organisations_valid.csv',
        '/data/ibb_organisations_test.csv',
    )
)

In [None]:
#Preprocessing for the complaints to the municipality's enterprises.
#The stop-word list and stemmer are (re)created here; clean_text2 reads both as globals.

stop_words = nltk.corpus.stopwords.words('turkish')

porter = TurkishStemmer()

def clean_text2(string):
    """Normalise one complaint text into a space-separated string of stemmed tokens.

    Steps, in order: replace e-mail addresses with 'emailaddr', blank out URLs,
    replace currency symbols / phone numbers / numbers with placeholder tokens,
    remove punctuation, collapse whitespace, lowercase, then drop stop words
    and stem the remaining terms.

    Parameters
    ----------
    string : the text to clean; any value is accepted because it is passed through str().

    Returns
    -------
    str : the cleaned terms joined by single spaces ('' if nothing survives).

    Reads the module-level `porter` (stemmer) and `stop_words` (list).
    """
    message = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', str(string))
    message = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', #Replace URLs with ' '
                     message)
    message = re.sub(r'₺|\$', 'money', message) #Replace money symbols with 'money'
    message = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', #Replace phone numbers with 'phonenumbr'
        'phonenumbr', message)
    message = re.sub(r'\d+(\.\d+)?', 'numbr', message)  #Replace numbers with 'numbr'
    message = re.sub(r'[^\w\d\s]', ' ', message) #Punctuation removed
    message = re.sub(r'\s+', ' ', message) #Runs of whitespace collapsed to one space
    message = re.sub(r'^\s+|\s+?$', '', message.lower()) #Lowercase and trim both ends

    #Build the lookup set once per call instead of once per term
    stop_set = set(stop_words)
    return ' '.join(
        porter.stem(term)
        for term in message.split()
        if term not in stop_set
    )

In [None]:
#Run clean_text2 over the 'comment' column of each enterprise split,
#overwriting the raw text in place.

for organisations_frame in (tr_organisations, val_organisations, tst_organisations):
    organisations_frame['comment'] = organisations_frame['comment'].apply(clean_text2)

In [None]:
#Give the enterprise splits the same two column names as the 'Irrelevant' data.

for organisations_frame in (tr_organisations, val_organisations, tst_organisations):
    organisations_frame.columns = ['Text', 'Label']

Combining "Irrelevant" class with the enterprises

In [None]:
#Append the 'Irrelevant' labels underneath the enterprise labels, split by split.

#Wrap each 'Irrelevant' label set in a one-column frame named 'Label'.
tr_irrel_lbl, val_irrel_lbl, tst_irrel_lbl = (
    pd.DataFrame(irrel_labels, columns = ['Label'])
    for irrel_labels in (tr_irrel_lbl, val_irrel_lbl, tst_irrel_lbl)
)

#Enterprise rows first, 'Irrelevant' rows second; the index is renumbered from 0.
y_train, y_valid, y_test = (
    pd.concat([organisations['Label'], irrelevant['Label']], axis = 0, ignore_index = True)
    for organisations, irrelevant in (
        (tr_organisations, tr_irrel_lbl),
        (val_organisations, val_irrel_lbl),
        (tst_organisations, tst_irrel_lbl),
    )
)

In [None]:
#Append the 'Irrelevant' texts underneath the enterprise texts, split by split
#(same row order as the label concatenation: enterprises first).

new_train, new_valid, new_test = (
    pd.concat([organisations['Text'], irrelevant['Text']], axis = 0, ignore_index = True)
    for organisations, irrelevant in (
        (tr_organisations, tr_irrel_text),
        (val_organisations, val_irrel_text),
        (tst_organisations, tst_irrel_text),
    )
)

In [None]:
#Put text and label side by side, then shuffle each split with a fixed seed
#and renumber the rows so text and label stay paired under a fresh 0..n-1 index.

import random

new_train_with_y = (
    pd.concat([new_train, y_train], axis = 1)
    .sample(frac = 1, random_state = 42)
    .reset_index(drop = True)
)
new_valid_with_y = (
    pd.concat([new_valid, y_valid], axis = 1)
    .sample(frac = 1, random_state = 42)
    .reset_index(drop = True)
)
new_test_with_y = (
    pd.concat([new_test, y_test], axis = 1)
    .sample(frac = 1, random_state = 42)
    .reset_index(drop = True)
)

In [None]:
#Pull the shuffled text and label columns back apart for each split,
#overwriting new_train / new_valid / new_test and y_train / y_valid / y_test.

new_train, y_train = new_train_with_y['Text'], new_train_with_y['Label']
new_valid, y_valid = new_valid_with_y['Text'], new_valid_with_y['Label']
new_test, y_test = new_test_with_y['Text'], new_test_with_y['Label']

Tokenization

In [None]:
# Tokenizing the sentences: the vocabulary is learned from the training texts only,
# then the same vocabulary is used to count words in the validation and test texts.

count_vectorizer = CountVectorizer()
vocab_fit = count_vectorizer.fit_transform(new_train.values)

# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out() replaces it.
# The fallback keeps the cell working on releases older than 1.0, which only have the old name.
if hasattr(count_vectorizer, 'get_feature_names_out'):
    vocab_columns = list(count_vectorizer.get_feature_names_out())
else:
    vocab_columns = count_vectorizer.get_feature_names()

train_vec = pd.DataFrame(vocab_fit.toarray(), columns=vocab_columns, index=new_train.index)

vocab_fit = count_vectorizer.transform(new_valid.values)
validation1_vec = pd.DataFrame(vocab_fit.toarray(), columns=vocab_columns, index=new_valid.index)

vocab_fit = count_vectorizer.transform(new_test.values)
test_vec = pd.DataFrame(vocab_fit.toarray(), columns=vocab_columns, index=new_test.index)

In [None]:
#Removing exceptionally short sentences from the datasets

MIN_TOKENS = 5 #Rows whose word counts add up to less than this are discarded

def _drop_short_rows(vec, labels, min_tokens = MIN_TOKENS):
    """Return (vec, labels) restricted to the rows of `vec` whose counts sum to at least `min_tokens`.

    The row totals live in a separate boolean mask rather than in a temporary
    'sum' column, so `vec` itself is never modified: re-running the cell cannot
    count the old totals again, and a vocabulary word spelled 'sum' is not overwritten.
    """
    keep = vec.sum(axis = 1) >= min_tokens
    #.copy() returns an independent frame, which avoids pandas' SettingWithCopyWarning on later edits
    return vec.loc[keep].copy(), labels.loc[keep]

train_vec2, y_train2 = _drop_short_rows(train_vec, y_train)
validation1_vec2, y_valid2 = _drop_short_rows(validation1_vec, y_valid)
test_vec2, y_test2 = _drop_short_rows(test_vec, y_test)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [None]:
#Stack the filtered labels and word-count vectors of all three splits (train, validation, test - in that order).
#NOTE(review): the original note says these are needed for find_label(); confirm where they are actually consumed.

y_labels = pd.concat([y_train2, y_valid2, y_test2], axis = 0)
total_vec = pd.concat([train_vec2, validation1_vec2, test_vec2], axis = 0)

ML algorithms and their results

In [None]:
#LogisticRegression results (fit on the training vectors, scored on the validation vectors)

from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, classification_report

#The liblinear solver uses a random number generator while fitting; random_state pins it so that
#the scores are reproducible, in line with the other models in this notebook (all seeded with 42).
lr_model = LogisticRegression(dual = True, solver = 'liblinear', max_iter = 5000, random_state = 42)
lr_model.fit(train_vec2, y_train2)
y_pred_lr = lr_model.predict(validation1_vec2)

print(classification_report(y_valid2, y_pred_lr))
print('\n')
print(confusion_matrix(y_valid2, y_pred_lr))
print('\n')
print('Accuracy:', accuracy_score(y_valid2, y_pred_lr))
print('\n')
print('f1 score:', f1_score(y_valid2, y_pred_lr, average = 'weighted'))

              precision    recall  f1-score   support

           0       0.94      0.96      0.95       532
           1       0.95      0.96      0.95       480
           2       0.93      0.90      0.91       441
           3       0.95      0.94      0.94       433
           4       0.97      0.99      0.98       419

    accuracy                           0.95      2305
   macro avg       0.95      0.95      0.95      2305
weighted avg       0.95      0.95      0.95      2305



[[509   0  16   3   4]
 [  0 459   4  13   4]
 [ 27  11 395   6   2]
 [  4  13   8 405   3]
 [  2   2   0   0 415]]


Accuracy: 0.9470715835140998


f1 score: 0.9468666100334894


In [None]:
#SGDClassifier results (fit on the training vectors, scored on the validation vectors)

from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state = 42).fit(train_vec2, y_train2)
y_pred_sgd = sgd_model.predict(validation1_vec2)

#Per-class report and confusion matrix, each followed by a spacer
for sgd_summary in (classification_report(y_valid2, y_pred_sgd),
                    confusion_matrix(y_valid2, y_pred_sgd)):
    print(sgd_summary)
    print('\n')

print('Accuracy:', accuracy_score(y_valid2, y_pred_sgd))
print('\n')
print('f1 score:', f1_score(y_valid2, y_pred_sgd, average = 'weighted'))

              precision    recall  f1-score   support

           0       0.93      0.97      0.95       532
           1       0.93      0.96      0.94       480
           2       0.95      0.88      0.91       441
           3       0.95      0.93      0.94       433
           4       0.98      0.99      0.98       419

    accuracy                           0.94      2305
   macro avg       0.95      0.94      0.94      2305
weighted avg       0.94      0.94      0.94      2305



[[514   1  11   4   2]
 [  2 459   3  14   2]
 [ 31  14 388   5   3]
 [  6  14   8 402   3]
 [  2   4   0   0 413]]


Accuracy: 0.9440347071583514


f1 score: 0.9437647483346557


In [None]:
#Random Forest: single-step pipeline tuned with a 5-fold grid search

from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, KFold, cross_val_score

pipe = Pipeline([('classifier', RandomForestClassifier(random_state = 42))])

# Search space: 10 forest sizes (10..100) x 6 max_features values (6..31) = 60 candidates.

rf_search_space = {
    'classifier' : [RandomForestClassifier(random_state = 42)],
    'classifier__n_estimators' : list(range(10,101,10)),
    'classifier__max_features' : list(range(6,32,5)),
}
param_grid = [rf_search_space]

# Grid search over the pipeline, using every available core.

clf = GridSearchCV(pipe, param_grid = param_grid, cv = 5, verbose=True, n_jobs=-1)

# fit() returns the search object itself, so best_clf and clf are the same object.

best_clf = clf.fit(train_vec2, y_train2)
y_pred_clf = clf.predict(validation1_vec2)

# Per-class report and confusion matrix, each followed by a spacer
for rf_summary in (classification_report(y_valid2, y_pred_clf),
                   confusion_matrix(y_valid2, y_pred_clf)):
    print(rf_summary)
    print('\n')

print('Accuracy:', accuracy_score(y_valid2, y_pred_clf))
print('\n')
print('f1 score:', f1_score(y_valid2, y_pred_clf, average = 'weighted'))

Fitting 5 folds for each of 60 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:  4.1min
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed: 18.8min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 30.6min finished


              precision    recall  f1-score   support

           0       0.84      0.96      0.90       532
           1       0.91      0.96      0.93       480
           2       0.94      0.74      0.83       441
           3       0.97      0.86      0.91       433
           4       0.93      0.99      0.96       419

    accuracy                           0.91      2305
   macro avg       0.92      0.90      0.91      2305
weighted avg       0.91      0.91      0.91      2305



[[513   0  13   1   5]
 [  4 462   0   8   6]
 [ 83  19 327   4   8]
 [ 13  27   8 373  12]
 [  1   2   0   0 416]]


Accuracy: 0.9071583514099784


f1 score: 0.9053327314264936


In [None]:
#Multinomial NB results (fit on the training vectors, scored on the validation vectors)

from sklearn.naive_bayes import MultinomialNB

mb_model = MultinomialNB().fit(train_vec2, y_train2)
y_pred_mb = mb_model.predict(validation1_vec2)

#Per-class report and confusion matrix, each followed by a spacer
for mb_summary in (classification_report(y_valid2, y_pred_mb),
                   confusion_matrix(y_valid2, y_pred_mb)):
    print(mb_summary)
    print('\n')

print('Accuracy:', accuracy_score(y_valid2, y_pred_mb))
print('\n')
print('f1 score:', f1_score(y_valid2, y_pred_mb, average = 'weighted'))

              precision    recall  f1-score   support

           0       0.86      0.95      0.90       532
           1       0.94      0.97      0.95       480
           2       0.90      0.78      0.83       441
           3       0.89      0.96      0.93       433
           4       1.00      0.88      0.94       419

    accuracy                           0.91      2305
   macro avg       0.92      0.91      0.91      2305
weighted avg       0.91      0.91      0.91      2305



[[505   1  20   6   0]
 [  1 466   0  13   0]
 [ 72  14 344  11   0]
 [  4   9   5 415   0]
 [  7   8  15  19 370]]


Accuracy: 0.911062906724512


f1 score: 0.9102597923197292


Demonstration of find_label function in examples

In [None]:
#Regular preprocessing for ordinary texts (for demonstration purposes), this time written as a generator.
#The stop-word list and stemmer are (re)created here; clean_text3 reads both as globals.

stop_words = nltk.corpus.stopwords.words('turkish')

porter = TurkishStemmer()

def clean_text3(string):
    """Generator that yields exactly one cleaned, stemmed version of `string`.

    Applies the same steps as clean_text2: e-mail addresses -> 'emailaddr',
    URLs -> space, currency symbols / phone numbers / numbers -> placeholder
    tokens, punctuation removed, whitespace collapsed, lowercased, stop words
    dropped and the remaining terms stemmed.

    Being a generator, the result is an iterable containing a single document,
    which is how find_label hands it to count_vectorizer.transform().

    Parameters
    ----------
    string : the text to clean; any value is accepted because it is passed through str().

    Yields
    ------
    str : the cleaned terms joined by single spaces ('' if nothing survives).

    Reads the module-level `porter` (stemmer) and `stop_words` (list).
    """
    message = re.sub(r'\b[\w\-.]+?@\w+?\.\w{2,4}\b', 'emailaddr', str(string))
    message = re.sub(r'(http[s]?\S+)|(\w+\.[A-Za-z]{2,4}\S*)', ' ', #Replace URLs with space
                     message)
    message = re.sub(r'₺|\$', 'money', message) #Replace money symbols with 'money'
    message = re.sub(
        r'\b(\+\d{1,2}\s)?\d?[\-(.]?\d{3}\)?[\s.-]?\d{3}[\s.-]?\d{4}\b', #Replace phone numbers with 'phonenumbr'
        'phonenumbr', message)
    message = re.sub(r'\d+(\.\d+)?', 'numbr', message)  #Replace numbers with 'numbr'
    message = re.sub(r'[^\w\d\s]', ' ', message) #Punctuation removed
    message = re.sub(r'\s+', ' ', message) #Runs of whitespace collapsed to one space
    message = re.sub(r'^\s+|\s+?$', '', message.lower()) #Lowercase and trim both ends

    #Build the lookup set once per call instead of once per term
    stop_set = set(stop_words)
    yield ' '.join(
        porter.stem(term)
        for term in message.split()
        if term not in stop_set
    )

In [None]:
#This function applies pre-processing on any string it receives, vectorizes it and tries to predict its class with MultinomialNB. 

#Display name for each numeric class label the model can predict
_LABEL_NAMES = {
  0: 'İgdaş (Gas distribution & billing)',
  1: 'İett (Public transportation)',
  2: 'İski (Water distribution & billing)',
  3: 'Diğer İBB iştirakleri (Other enterprises)',
  4: 'İlgisiz (Irrelevant)',
}

def find_label(sentence):
  """Predict which enterprise (or 'Irrelevant') a single sentence belongs to.

  The sentence is cleaned with clean_text3, turned into word counts with the
  already-fitted `count_vectorizer`, and classified with `mb_model`.

  Parameters
  ----------
  sentence : the raw text to classify.

  Returns
  -------
  str or None : the display name of the predicted class, or None when the
  predicted label is not one of 0-4.
  """

  #The sentence is passed as-is. Wrapping it in a list first made clean_text3 clean
  #the list's repr, so escape sequences (e.g. a newline shown as \n) leaked into the tokens.
  #clean_text3 is a generator yielding one cleaned document, i.e. an iterable of documents.
  new_sentence = clean_text3(sentence)

  new_vocab_vec = count_vectorizer.transform(new_sentence)

  #One document in, one prediction out: take that single label instead of comparing the whole array
  new_pred = mb_model.predict(new_vocab_vec)[0]

  return _LABEL_NAMES.get(new_pred)

In [None]:
#Demo: the Turkish sentence below (roughly "the bus driver's great disgrace") is a complaint about a bus driver, so it should be routed to İett.

find_label('Otobüs şoförünün büyük ayıbı')

'İett (Public transportation)'

In [None]:
#Demo: the Turkish sentence below (roughly "Exorbitant natural gas bill!") is a complaint about gas billing, so it should be routed to İgdaş.

find_label('Fahiş doğalgaz faturası!')

'İgdaş (Gas distribution & billing)'