In [1]:
import spacy
from spacy.lang.tr import Turkish
from spacy.util import minibatch, compounding
import pandas as pd
import numpy as np

In [2]:
tweetspath = '/Users/pinarayaz/Jupyter/NLP/data/tweets_deasciified.csv'
tweets_df = pd.read_csv(tweetspath)
tweets_df.head()

Unnamed: 0,Tweet,Sentiment
0,Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Türkc...,olumsuz
1,20 dk 1 GB internet 500 mb sadece kaşar türkce...,olumsuz
2,Ayrıca türkcell superonline reklamı kadar da k...,olumsuz
3,Türkcell çok pahalı ya,olumsuz
4,Türkcell Kaş'ta internetin çekmiyor,olumsuz


In [3]:
nlp = Turkish()
sample_review = tweets_df.Tweet[0]
sample_review

"Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Türkcell ınternet paketin bitti diye nasıl mesaj atabilir bana ya? Onu da mı ödeyelim"

In [4]:
parsed_review = nlp(sample_review)
parsed_review

Ulan Wifi'ye bağlıyım ben. Ona bağlıyken Türkcell ınternet paketin bitti diye nasıl mesaj atabilir bana ya? Onu da mı ödeyelim

In [5]:
tokenized_text = pd.DataFrame()

for i, token in enumerate(parsed_review):
    tokenized_text.loc[i, 'text'] = token.text
    tokenized_text.loc[i, 'lemma'] = token.lemma_,
    tokenized_text.loc[i, 'pos'] = token.pos_
    tokenized_text.loc[i, 'tag'] = token.tag_
    tokenized_text.loc[i, 'dep'] = token.dep_
    tokenized_text.loc[i, 'shape'] = token.shape_
    tokenized_text.loc[i, 'is_alpha'] = token.is_alpha
    tokenized_text.loc[i, 'is_stop'] = token.is_stop
    tokenized_text.loc[i, 'is_punctuation'] = token.is_punct

tokenized_text.head()

Unnamed: 0,text,lemma,pos,tag,dep,shape,is_alpha,is_stop,is_punctuation
0,Ulan,Ulan,,,,Xxxx,True,False,False
1,Wifi'ye,"(Wifi'ye,)",,,,Xxxx'xx,False,False,False
2,bağlıyım,"(bağ,)",,,,xxxx,True,False,False
3,ben,"(ben,)",,,,xxx,True,True,False
4,.,"(.,)",,,,.,False,False,True


In [6]:
tweets_df['tuples'] = tweets_df.apply(lambda row: (row['Tweet'],row['Sentiment']), axis=1)
train = tweets_df['tuples'].tolist()

In [7]:
nlp = spacy.blank("tr")  # create blank Language class

if "textcat" not in nlp.pipe_names:
    textcat = nlp.create_pipe("textcat", config={"exclusive_classes": True, "architecture": "simple_cnn"})
    nlp.add_pipe(textcat, last=True)
else:
    textcat = nlp.get_pipe("textcat")

In [8]:
textcat.add_label("POSITIVE")
textcat.add_label("NEUTRAL")
textcat.add_label("NEGATIVE")

1

In [9]:
def load_data(limit=0, split=0.8):
    train_data = train
    np.random.shuffle(train_data)
    train_data = train_data[-limit:]
    texts, labels = zip(*train_data)
    cats = [{"POSITIVE": y == "olumlu", "NEUTRAL": y == "notr", "NEGATIVE": y == "olumsuz"} for y in labels]
    split = int(len(train_data) * split)
    return (texts[:split], cats[:split]), (texts[split:], cats[split:])

In [10]:
def evaluate(tokenizer, textcat, texts, cats):
    docs = (tokenizer(text) for text in texts)
    tp = 1e-8  # True positives
    fp = 1e-8  # False positives
    fn = 1e-8  # False negatives
    tn = 1e-8  # True negatives
    for i, doc in enumerate(textcat.pipe(docs)):
        gold = cats[i]
        for label, score in doc.cats.items():
            if label not in gold:
                continue
            if score >= 0.5 and gold[label] >= 0.5:
                tp += 1.
            elif score >= 0.5 and gold[label] < 0.5:
                fp += 1.
            elif score < 0.5 and gold[label] < 0.5:
                tn += 1
            elif score < 0.5 and gold[label] >= 0.5:
                fn += 1
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    f_score = 2 * (precision * recall) / (precision + recall)
    accuracy = (tp + tn) / (tp + fp + fn + tn)
    return {'textcat_p': precision, 'textcat_r': recall, 'textcat_f': f_score, 'textcat_a': accuracy}

In [11]:
# load the dataset
lim = 17289
print("Loading Tweets dataset...")
(train_texts, train_cats), (dev_texts, dev_cats) = load_data(limit=lim)
print("Using {} examples ({} training, {} evaluation)".format(lim, len(train_texts), len(dev_texts)))
train_data = list(zip(train_texts, [{'cats': cats} for cats in train_cats]))

Loading Tweets dataset...
Using 17289 examples (13831 training, 3458 evaluation)


In [12]:
# get names of other pipes to disable them during training
other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'textcat']
with nlp.disable_pipes(*other_pipes):  # only train textcat
    optimizer = nlp.begin_training()
    print("Training the model...")
    print('{:^5}\t{:^5}\t{:^5}\t{:^5}\t{:^5}'.format('LOSS', 'P', 'R', 'F', 'Accuracy'))
    for i in range(10):
        losses = {}
        # batch up the examples using spaCy's minibatch
        batches = minibatch(train_data, size=compounding(4., 32., 1.001))
        for batch in batches:
            texts, annotations = zip(*batch)
            nlp.update(texts, annotations, sgd=optimizer, drop=0.2, losses=losses)
        with textcat.model.use_params(optimizer.averages):
            # evaluate on the dev data split off in load_data()
            scores = evaluate(nlp.tokenizer, textcat, dev_texts, dev_cats)
        print('{0:.3f}\t{1:.3f}\t{2:.3f}\t{3:.3f}\t{4:.3f}'
              .format(losses['textcat'], scores['textcat_p'],
                      scores['textcat_r'], scores['textcat_f'], scores['textcat_a']))

Training the model...
LOSS 	  P  	  R  	  F  	Accuracy
21.383	0.713	0.553	0.623	0.777
16.540	0.708	0.601	0.650	0.784
12.613	0.698	0.615	0.653	0.783
9.317	0.688	0.629	0.657	0.781
7.354	0.686	0.643	0.664	0.783
5.650	0.682	0.655	0.668	0.783
4.501	0.679	0.657	0.668	0.782
3.891	0.679	0.661	0.670	0.783
3.479	0.676	0.659	0.667	0.781
3.375	0.677	0.665	0.671	0.783


In [13]:
# test the trained model
test_text = "bence fiyatlar gayet normal çekim kalitesine göre. dağda bayırda bile çekiyor daha ne olsun!"
doc = nlp(test_text)
print(test_text, doc.cats)

bence fiyatlar gayet normal çekim kalitesine göre. dağda bayırda bile çekiyor daha ne olsun! {'POSITIVE': 0.010449743829667568, 'NEUTRAL': 6.190087151480839e-05, 'NEGATIVE': 0.9894883036613464}


In [14]:
#save model
output_dir = '/Users/pinarayaz/Jupyter/NLP/spacy_models/tweets_spacy'
if output_dir is not None:
    with nlp.use_params(optimizer.averages):
        nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

Saved model to /Users/pinarayaz/Jupyter/NLP/spacy_models/tweets_spacy


In [15]:
#test the saved model
test_text = "Evlat olsun sevilmezsin #Turkcel. Seni çekemicem sanırım , yarın en yakın yerden iptal edicem!!!"
nlp_test = spacy.load(output_dir)
doc2 = nlp_test(test_text)
print(test_text, doc2.cats)

score = 0
for cat in doc2.cats:
    if(doc2.cats[cat] > score):
        sentiment = cat
        score = doc2.cats[cat]
print("Sentiment:", sentiment)

Evlat olsun sevilmezsin #Turkcel. Seni çekemicem sanırım , yarın en yakın yerden iptal edicem!!! {'POSITIVE': 0.14946793019771576, 'NEUTRAL': 0.6209986209869385, 'NEGATIVE': 0.22953341901302338}
Sentiment: NEUTRAL
