In [1]:
import pandas as pd
import spacy
import numpy as np
import sklearn as sk
import matplotlib.pyplot as plt
import seaborn as sns
from spacy.lang.tr import Turkish
from spacy.lang.tr.stop_words import STOP_WORDS
import string
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing

In [3]:
reviewspath = '/Users/pinarayaz/Jupyter/NLP/data/reviews_deasciified.csv'
reviews_df = pd.read_csv(reviewspath)
reviews_df.head()

Unnamed: 0,Review,Rating
0,nicholson gene harika,4.0
1,mükemmel derece kötü diyen arkadaşın sinema bi...,5.0
2,mükemmel derecede kötü bi film hep biselerin o...,1.0
3,nasıl beğendiğinizi anlamıyorum bu filmi filmd...,1.5
4,ok harika bir film senaryo gereğinden fazla ol...,5.0


In [13]:
sentiment = []
for num in reviews_df['Rating']:
    if(num < 3.0):
        sentiment.append("negative")
    else:
        sentiment.append("positive")

reviews_df['Sentiment'] = sentiment
reviews_df.head()

Unnamed: 0,Review,Rating,Sentiment
0,nicholson gene harika,4.0,positive
1,mükemmel derece kötü diyen arkadaşın sinema bi...,5.0,positive
2,mükemmel derecede kötü bi film hep biselerin o...,1.0,negative
3,nasıl beğendiğinizi anlamıyorum bu filmi filmd...,1.5,negative
4,ok harika bir film senaryo gereğinden fazla ol...,5.0,positive


In [4]:
stopwords = list(STOP_WORDS)
punctuations = string.punctuation
nlp = Turkish()

#custom tokenizer for filtering
def custom_tokenizer(sentence):
    tokens = nlp(sentence)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    return tokens

#custom transformer using spacy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

#basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

In [5]:
#vectorizer and classifier
vectorizer = CountVectorizer(tokenizer = custom_tokenizer, ngram_range=(1,1)) 
classifier = LinearSVC()

In [14]:
X = reviews_df['Review']
ylabels = reviews_df['Sentiment']

#encode continuous labels
#lab_enc = preprocessing.LabelEncoder()
#ylabels_enc = lab_enc.fit_transform(ylabels)

#splitting the dataset
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [18]:
#create the pipeline to clean, tokenize, vectorize, and classify using "Count Vectorizor"
pipeline = Pipeline([("cleaner", predictors()), ('vectorizer', vectorizer), ('classifier', classifier)])

#fit our data
pipeline.fit(X_train, y_train)



Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x11dccec18>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [19]:
# calculate accuracy
print("Train Accuracy: %.2f" % pipeline.score(X_train, y_train))
print("Test Accuracy: %.2f" % pipeline.score(X_test, y_test))

#calculate precision, recall, f1 score
y_pred = pipeline.predict(y_test)
print("Precision: %.2f" % precision_score(y_test, y_pred, average="micro"))
print("Recall: %.2f" % recall_score(y_test, y_pred, average="micro"))
print("F1 Score: %.2f" % f1_score(y_test, y_pred, average="micro"))

Train Accuracy: 0.95
Test Accuracy: 0.76
Precision: 0.70
Recall: 0.70
F1 Score: 0.70


In [21]:
#perform cross validation
scores = cross_val_score(pipeline, X, ylabels, cv=5)
print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))



Cross Validation Accuracy: 0.75 (+/- 0.02)
