In [2]:
import xml.etree.cElementTree as et
import pandas as pd
import spacy
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
import json
import xmltodict
from spacy.lang.tr import Turkish
from  spacy.lang.tr.stop_words import STOP_WORDS
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score 
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

In [3]:
#convert xml to pandas dataframe
def getvalueofnode(node):
    """Return ``node.text``, or ``None`` when ``node`` itself is ``None``."""
    if node is None:
        return None
    return node.text

# Parse the XML file and collect one (Entry, ValidatedEmotion) pair per child
# element of the document root.
parsed_xml = et.parse("TREMODATA.xml")
dfcols = ['Entry', 'ValidatedEmotion']

# Gather every row first and build the frame in a single call.
# DataFrame.append was removed in pandas 2.0, and calling it inside a loop
# copied the whole frame for each new row.
records = [
    [getvalueofnode(node.find('Entry')), getvalueofnode(node.find('ValidatedEmotion'))]
    for node in parsed_xml.getroot()
]
df_xml = pd.DataFrame(records, columns=dfcols)

In [5]:
# Materialise the imported Turkish STOP_WORDS collection as a list for token filtering
stopwords = [*STOP_WORDS]

In [17]:
# Turkish language object used to tokenize text
# (alternative kept for reference: parser = spacy.load("xx_ent_wiki_sm"))
parser = Turkish()
# Punctuation characters used when filtering tokens
punctuations = string.punctuation

In [7]:
def my_tokenizer(sentence):
    """Tokenize ``sentence`` and return normalised tokens for vectorization.

    The text is run through the module-level ``parser``. Each token is reduced
    to its lowercased, stripped lemma, and tokens found in ``stopwords`` or in
    ``punctuations`` are discarded.

    Args:
        sentence: Text handed to ``parser``.

    Returns:
        list of str: The surviving tokens, in document order.
    """
    mytokens = []
    for word in parser(sentence):
        lemma = word.lemma_
        # Use the lowercased surface form when there is no usable lemma.
        # "-PRON-" is a placeholder rather than a word. An empty lemma
        # (NOTE(review): expected when the pipeline has no lemmatizer, e.g. a
        # blank Turkish() on spaCy v3 -- confirm for the installed version)
        # would otherwise turn every token into "" and lose it below.
        if not lemma or lemma == "-PRON-":
            token = word.lower_.strip()
        else:
            token = lemma.lower().strip()
        # `punctuations` is a plain string, so this is a substring test;
        # it therefore also rejects the empty string.
        if token not in stopwords and token not in punctuations:
            mytokens.append(token)
    return mytokens

In [8]:
# Custom transformer that normalises raw text with clean_text
class predictors(TransformerMixin):
    """Stateless transformer that applies ``clean_text`` to every document."""

    def transform(self, X, **transform_params):
        """Return a list holding ``clean_text(text)`` for each item of ``X``."""
        return [clean_text(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        """Return ``self`` unchanged; there is nothing to learn.

        ``y`` defaults to ``None`` so the transformer can also be fitted
        without labels (for example ``fit_transform(X)``); passing ``y``
        positionally keeps working as before.
        """
        return self

    def get_params(self, deep=True):
        """Return an empty dict: the transformer has no parameters."""
        return {}

# Basic function to clean the text 
def clean_text(text):
    """Return ``text`` with surrounding whitespace removed and letters lowercased."""
    trimmed = text.strip()
    return trimmed.lower()

In [23]:
# Vectorization: token counts over unigrams produced by my_tokenizer
vectorizer = CountVectorizer(tokenizer=my_tokenizer, ngram_range=(1, 1))
# Seed the classifier so repeated fits are reproducible; the value matches
# the random_state already used for train_test_split in this notebook.
classifier = LinearSVC(max_iter=4000, random_state=42)

In [10]:
# Features are the entries; targets are the validated emotion labels
X, ylabels = df_xml['Entry'], df_xml['ValidatedEmotion']

# Hold out 20% of the rows for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels,
    test_size=0.2,
    random_state=42,
)
df_xml.head()

Unnamed: 0,Entry,ValidatedEmotion
0,her yeni gün bir mutluluk,Happy
1,gece kimsenin olmadığı sokaklardan geçerken ço...,Fear
2,gerçekleşemeyen hayaller,Sadness
3,arkadaş kaybetmek beni üzüyor,Sadness
4,insanların çıkarcı olmalarından tiksiniyorum,Disgust


In [11]:
# Assemble the clean -> vectorize -> classify pipeline around CountVectorizer
pipe_countvect = Pipeline(steps=[
    ("cleaner", predictors()),
    ('vectorizer', vectorizer),
    ('classifier', classifier),
])
# Train on the training split
pipe_countvect.fit(X_train, y_train)

# Accuracy on the held-out and the training split
test_accuracy = pipe_countvect.score(X_test, y_test)
train_accuracy = pipe_countvect.score(X_train, y_train)
print("Test Accuracy: ", test_accuracy)
print("Train Accuracy: ", train_accuracy)

Test Accuracy:  0.8149908592321755
Train Accuracy:  0.9775137111517368


In [12]:
# 5-fold cross-validation of the pipeline over all entries
scores = cross_val_score(pipe_countvect, X, ylabels, cv=5)
cv_summary = (scores.mean(), scores.std() * 2)
print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % cv_summary)

Cross Validation Accuracy: 0.81 (+/- 0.07)


In [36]:
#xml to json
# Read with an explicit encoding instead of the platform default, which can
# mangle or reject the Turkish characters on non-UTF-8 locales.
# NOTE(review): assumes TREMODATA.xml is UTF-8 encoded -- confirm.
with open("TREMODATA.xml", 'r', encoding='utf8') as f:
    xmlString = f.read()

jsonString = json.dumps(xmltodict.parse(xmlString), ensure_ascii=False, indent=4).encode('utf8')
# Write the already-encoded UTF-8 bytes directly. Decoding them and writing in
# text mode would re-encode with the platform default encoding.
with open("output.json", 'wb') as f:
    f.write(jsonString)

In [13]:
# Evaluate on the reviews dataset: load the first sheet of the workbook
reviewsfile = 'reviews.xlsx'
reviews_df = pd.read_excel(
    reviewsfile,
    sheet_name=0,
    header=0,
    index_col=False,
    keep_default_na=True,
)
reviews_df.head()

Unnamed: 0,Review,Rating
0,nicholson gene harika,4.0
1,mukemmel derece kotu diyen arkadasin sinema bi...,5.0
2,mukemmel derecede kotu bi film hep biselerin o...,1.0
3,nasil begendiginizi anlamiyorum bu filmi filmd...,1.5
4,ok harika bir film senaryo gereginden fazla ol...,5.0


In [14]:
# Distinct rating values, followed by one example review (index label 10)
ratings = reviews_df['Rating'].unique()
print(ratings)
print(reviews_df['Review'][10])

[4.  5.  1.  1.5 3.5 3.  4.5 2.  2.5 0. ]
iliskiler uzerine soyleyecek sozu olan enteresan bir film benim beklentilerimi tam karsilamadi ama bir fransiz sinemacisindan istanbul u da izlemek gerek 


In [15]:
X, ylabels = reviews_df['Review'], reviews_df['Rating']

# Map each distinct rating value to an integer class id
lab_enc = preprocessing.LabelEncoder()
ylabels_enc = lab_enc.fit_transform(ylabels)

# Hold out 20% of the reviews for testing, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    ylabels_enc,
    test_size=0.2,
    random_state=42,
)
print(np.unique(ylabels_enc))

[0 1 2 3 4 5 6 7 8 9]


In [24]:
# Refit the same pipeline on the review split
pipe_countvect.fit(X_train, y_train)

# Accuracy on the held-out and the training split
test_accuracy = pipe_countvect.score(X_test, y_test)
train_accuracy = pipe_countvect.score(X_train, y_train)
print("Test Accuracy: ", test_accuracy)
print("Train Accuracy: ", train_accuracy)



Test Accuracy:  0.3100885967419263
Train Accuracy:  0.9534509859959989


In [29]:
# 5-fold cross-validation of the pipeline over all reviews (encoded ratings)
scores = cross_val_score(pipe_countvect, X, ylabels_enc, cv=5)
cv_summary = (scores.mean(), scores.std() * 2)
print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % cv_summary)



Cross Validation Accuracy: 0.31 (+/- 0.02)


In [16]:
# Predict the class for a hand-written sample review
sample_review = ["cok kotu degil ama yine de bos"]
pipe_countvect.predict(sample_review)

array([4])

In [17]:
# Evaluate on the tweets dataset: load the first sheet of the training workbook
traintweetsfile = 'train_tweets.xlsx'
traintweets_df = pd.read_excel(
    traintweetsfile,
    sheet_name=0,
    header=0,
    index_col=False,
    keep_default_na=True,
)
traintweets_df.head()

Unnamed: 0,Tweet,Sentiment
0,Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Turkc...,olumsuz
1,20 dk 1 GB internet 500 mb sadece kaşar turkce...,olumsuz
2,Ayrıca turkcell superonline reklamı kadar da k...,olumsuz
3,Turkcell çok pahalı ya,olumsuz
4,Turkcell Kaş'ta internetin cekmiyor,olumsuz


In [18]:
# Load the first sheet of the test-tweets workbook
testtweetsfile = 'test_tweets.xlsx'
testtweets_df = pd.read_excel(
    testtweetsfile,
    sheet_name=0,
    header=0,
    index_col=False,
    keep_default_na=True,
)
testtweets_df.head()

Unnamed: 0,Tweet,Sentiment
0,Turkcell'e kızgınım. Ve bu kızgınlık sanırım a...,olumsuz
1,turkcell kadar şerefsiz misiniz ya,olumsuz
2,Burdan Turkcell'e sesleniyorum o 3 tl haram olsun,olumsuz
3,Hayatımda turkcell kadar kazık 1 operatör görm...,olumsuz
4,Turkcell gözümde son demlerini yaşıyor hattı d...,olumsuz


In [31]:
# Train on the training tweets and evaluate on the separately loaded test tweets
X_train, y_train = traintweets_df['Tweet'], traintweets_df['Sentiment']
X_test, y_test = testtweets_df['Tweet'], testtweets_df['Sentiment']

pipe_countvect.fit(X_train, y_train)

# Disabled per-sample prediction dump, kept as an inert string literal
'''
# Predicting with a test dataset
sample_prediction = pipe_countvect.predict(X_test)
# Prediction Results
for (sample,pred) in zip(X_test,sample_prediction):
    print(sample,"Prediction=>",pred)
'''

# Accuracy on the test and the training tweets
test_accuracy = pipe_countvect.score(X_test, y_test)
train_accuracy = pipe_countvect.score(X_train, y_train)
print("Test Accuracy: ", test_accuracy)
print("Train Accuracy: ", train_accuracy)

Test Accuracy:  0.6496962684408447
Train Accuracy:  0.9791064198958935


In [20]:
# Predict the sentiment of a hand-written sample tweet
sample_tweet = ["turkcell berbat"]
pipe_countvect.predict(sample_tweet)

array(['olumsuz'], dtype=object)

In [32]:
# 5-fold cross-validation over the train and test tweets combined
X, ylabels = pd.concat([X_train, X_test]), pd.concat([y_train, y_test])
scores = cross_val_score(pipe_countvect, X, ylabels, cv=5)
cv_summary = (scores.mean(), scores.std() * 2)
print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % cv_summary)



Cross Validation Accuracy: 0.63 (+/- 0.04)
