In [1]:
import pandas as pd
import numpy as np
import sklearn as sk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.base import TransformerMixin 
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import spacy
from spacy.util import minibatch, compounding

In [9]:
#read twt dataset
path = '/Users/pinarayaz/Jupyter/NLP/data/twt_preprocessed.csv'
df = pd.read_csv(path)
df.head()

Unnamed: 0,Tweet,Label
0,kız nutellayı abartıyo sosyal medyasında kendi...,negative
1,ooooo dedikodu alırım bi dal :(,negative
2,:(,negative
3,bıkmayacaksın :),positive
4,whatsapp grubuna gelecekler favlasın muhabbetl...,positive


## sklearn LinearSVC

In [24]:
#vectorizer and classifier
vectorizer = CountVectorizer(ngram_range=(1,1))
classifier = LinearSVC()

In [25]:
#split the dataset
X = df['Tweet'].values.astype('U')
ylabels = df['Label'].values.astype('U')

X_train, X_test, y_train, y_test = train_test_split(X, ylabels, test_size=0.2, random_state=42)

In [26]:
#create the pipeline to vectorize and classify
pipeline = Pipeline([('vectorizer', vectorizer),
                     ('classifier', classifier)
                    ])
    
#fit our data
pipeline.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('vectorizer', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
       ...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

In [27]:
# calculate accuracy
print("Train Accuracy: %.2f" % pipeline.score(X_train, y_train))
print("Test Accuracy: %.2f" % pipeline.score(X_test, y_test))

#calculate precision, recall, f1 score
y_pred = pipeline.predict(y_test)
print("Precision: %.2f" % precision_score(y_test, y_pred, average="micro"))
print("Recall: %.2f" % recall_score(y_test, y_pred, average="micro"))
print("F1 Score: %.2f" % f1_score(y_test, y_pred, average="micro"))

Train Accuracy: 0.98
Test Accuracy: 0.73
Precision: 0.50
Recall: 0.50
F1 Score: 0.50


## training spacy blank model

In [16]:
df['Tweet'] = df['Tweet'].values.astype('U')
df['Label'] = df['Label'].values.astype('U')

df['tuples'] = df.apply(lambda row: (row['Tweet'], row['Label']), axis=1)
train = df['tuples'].tolist()
train[:10]

[('kız nutellayı abartıyo sosyal medyasında kendinden nutella üzgün :(',
  'negative'),
 ('ooooo dedikodu alırım bi dal :(', 'negative'),
 (':(', 'negative'),
 ('bıkmayacaksın :)', 'positive'),
 ('whatsapp grubuna gelecekler favlasın muhabbetler fena :)', 'positive'),
 ('bebeğim çalışıyorum :(', 'negative'),
 ('vasip şahin candır :d', 'positive'),
 ('komsular modemi gec aciyorlar mk :( :( :(', 'negative'),
 ('ahhhh ketek uler :(', 'negative'),
 ("yedi güzel adam yeni günü'nde yeni saatinde pazartesi 55 te ..",
  'positive')]

In [17]:
nlp = spacy.blank("tr")  # create blank Language class

if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe("textcat")

textcat.add_label("POSITIVE")
textcat.add_label("NEGATIVE")

1

In [18]:
def load_data(limit=0, split=0.8):
    train_data = train
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": y == 'positive', "NEGATIVE": y == 'negative'} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [19]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score, 'textcat_a': accuracy}

In [20]:
# load the dataset
lim = len(train)
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=lim)
print("Using {} examples ({} training, {} evaluation)".format(lim, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

Using 32000 examples (25600 training, 6400 evaluation)


In [21]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F', 'Accuracy'))
    for i in range(10):
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f'], scores['textcat_a']))

Training the model...
LOSS 	  P  	  R  	  F  	Accuracy
3.670	0.970	0.970	0.970	0.970
1.712	0.969	0.969	0.969	0.969
0.901	0.972	0.972	0.972	0.972
0.581	0.972	0.972	0.972	0.972
0.424	0.972	0.972	0.972	0.972
0.396	0.973	0.973	0.973	0.973
0.316	0.973	0.973	0.973	0.973
0.365	0.972	0.972	0.972	0.972
0.313	0.973	0.973	0.973	0.973
0.290	0.972	0.972	0.972	0.972


In [22]:
#save model
output_dir = '/Users/pinarayaz/Jupyter/NLP/spacy_models/twt_spacy'
if output_dir is not None:
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /Users/pinarayaz/Jupyter/NLP/spacy_models/twt_spacy


In [23]:
#test the saved model
test_text = "Lakin bu fani hayatta bize işkence yapmayın be. Midem bulandı resmen."
nlp_test = spacy.load(output_dir)
doc2 = nlp_test(test_text)
print(test_text, doc2.cats)

score = 0
for cat in doc2.cats:
    if(doc2.cats[cat] > score):
        sentiment = cat
        score = doc2.cats[cat]
print("Sentiment:", sentiment)

Lakin bu fani hayatta bize işkence yapmayın be. Midem bulandı resmen. {'POSITIVE': 4.2773812310770154e-05, 'NEGATIVE': 0.9999572038650513}
Sentiment: NEGATIVE
