In [11]:
import pandas as pd
import numpy as np
import sklearn as sk
import string
import spacy

from spacy.lang.tr import Turkish
from  spacy.lang.tr.stop_words import STOP_WORDS

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score

#sentiment analysis with DBN (deep belief network)
from dbn.tensorflow import SupervisedDBNClassification

In [12]:
#read tremo dataset
tremopath = '/Users/pinarayaz/Jupyter/NLP/data/tremo.csv'
tremo_df = pd.read_csv(tremopath)
tremo_df.head()

Unnamed: 0,Entry,ValidatedEmotion
0,her yeni gün bir mutluluk,Happy
1,gece kimsenin olmadığı sokaklardan geçerken ço...,Fear
2,gerçekleşemeyen hayaller,Sadness
3,arkadaş kaybetmek beni üzüyor,Sadness
4,insanların çıkarcı olmalarından tiksiniyorum,Disgust


In [13]:
stopwords = list(STOP_WORDS)
punctuations = string.punctuation
nlp = Turkish()
#nlp = spacy.load("xx_ent_wiki_sm")

In [14]:
#stopwords and punctuations
stopwords = list(STOP_WORDS)
punctuations = string.punctuation
nlp = Turkish()
#nlp = spacy.load("xx_ent_wiki_sm")

#custom tokenizer for filtering
def custom_tokenizer(sentence):
    tokens = nlp(sentence)
    tokens = [word.lemma_.lower().strip() if word.lemma_ != "-PRON-" else word.lower_ for word in tokens]
    tokens = [word for word in tokens if word not in stopwords and word not in punctuations]
    return tokens

#custom transformer using spacy 
class predictors(TransformerMixin):
    def transform(self, X, **transform_params):
        return [clean_text(text) for text in X]
    def fit(self, X, y, **fit_params):
        return self
    def get_params(self, deep=True):
        return {}

#basic function to clean the text 
def clean_text(text):     
    return text.strip().lower()

#for transforming sparse matrix to dense
class DenseTransformer(TransformerMixin):
    def fit(self, X, y=None, **fit_params):
        return self
    def transform(self, X, y=None, **fit_params):
        dense = X.todense()
        rows = dense.shape[0]
        cols = dense.shape[1]
        to_pad = np.zeros((rows, rows-cols))
        result = np.concatenate((dense, to_pad), axis=1)
        print(result.shape)
        return result

In [15]:
#vectorizer and classifier
vectorizer = CountVectorizer(tokenizer = custom_tokenizer, ngram_range=(1,1))
#vectorizer = TfidfVectorizer(tokenizer = custom_tokenizer)

classifier = LinearSVC()
#classifier = SupervisedDBNClassification(hidden_layers_structure=[256, 256],
                                         #learning_rate_rbm=0.05,
                                         #learning_rate=0.1,
                                         #n_epochs_rbm=1, #10
                                         #n_iter_backprop=10, #100
                                         #batch_size=32,
                                         #activation_function='relu',
                                         #dropout_p=0.2)

In [16]:
#split the dataset
X = tremo_df['Entry']
ylabels = tremo_df['ValidatedEmotion']
X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [17]:
#create the pipeline to clean, tokenize, vectorize and classify
pipeline = Pipeline([
                    ("cleaner", predictors()), 
                    ('vectorizer', vectorizer),
                    #('to_dense', DenseTransformer()), #for dbn model
                    ('classifier', classifier)
                    ])
    
#fit our data
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('cleaner', <__main__.predictors object at 0x1254d9ba8>), ('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [18]:
# calculate accuracy
print("Train Accuracy: %.2f" % pipeline.score(X_train, y_train))
print("Test Accuracy: %.2f" % pipeline.score(X_test, y_test))

#calculate precision, recall, f1 score
y_pred = pipeline.predict(y_test)
print("Precision: %.2f" % precision_score(y_test, y_pred, average="micro"))
print("Recall: %.2f" % recall_score(y_test, y_pred, average="micro"))
print("F1 Score: %.2f" % f1_score(y_test, y_pred, average="micro"))

Train Accuracy: 0.94
Test Accuracy: 0.86
Precision: 0.20
Recall: 0.20
F1 Score: 0.20


In [19]:
#perform cross validation
scores = cross_val_score(pipeline, X, ylabels, cv=5)
print("Cross Validation Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Cross Validation Accuracy: 0.85 (+/- 0.08)
