### 8.3 Metin Sınıflandırma Part 1

In [1]:
"""
spam veri seti -> spam ve ham -> binanry classification with Decision Tree
"""

# import libraries 

import pandas as pd 

# veri yükle 
data = pd.read_csv("datasets/metin_siniflandirma_spam_veri_seti.csv", encoding="latin-1")
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)
data.columns = ['label', 'text']

# EDA : Kesifsel veri analizi : missing value

print(data.isnull().sum())


label    0
text     0
dtype: int64


#### 8.4 Part 2

In [4]:
# text cleaning and preprocessing : ozel karakterler, lowercse, tokenization , stopwords, lemmatization

import nltk 
nltk.download('stopwords') # cok kullanılan ve anlam tasimayan sozcukler imetin icersinden cıkartalım
nltk.download('wordnet') # lemma bulmak icin gerekli olan veriseti
nltk.download('omw-1.4') # wordnete ait farkli dillerin kelime anlamlarını iceren bir veri seti 

import re 
from nltk.corpus import stopwords # stopwords lerden kurtulmak icin 
from nltk.stem import WordNetLemmatizer # lemmatization

text = list(data.text)
lemmatizer = WordNetLemmatizer()

corpus = []
for i in range(len(text)):
    r = re.sub("[^a-zA-Z]", " ", text[i]) # metin icersinde harf olmayan tum karakterleri boslukla degistir
    r = r.lower() # tum harfleri kucuk harfe cevir
    r = r.split() # kelimeleri ayir
    r = [word for word in r if word not in stopwords.words('english')] # stopwordslerden kurtul
    r = [lemmatizer.lemmatize(word) for word in r]
    r = " ".join(r)
    corpus.append(r)
data['text2'] = corpus 

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### 8.5 Part 3 

In [5]:
# model training and evaluation

X = data['text2']
y = data['label']

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=42)

# feature extraction : BoW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X_train_cv = cv.fit_transform(X_train)

# classifier training : model training and evaluation 
from sklearn.tree import DecisionTreeClassifier
dt = DecisionTreeClassifier()
dt.fit(X_train_cv, y_train)

X_test_cv = cv.transform(X_test)

# prediction
prediction = dt.predict(X_test_cv)

from sklearn.metrics import confusion_matrix

c_matrix = confusion_matrix(y_test, prediction)

acc = 100*(sum(sum(c_matrix)) - c_matrix[1,0] - c_matrix[0 ,1]) / sum(sum(c_matrix))
print(f"accuracy : {acc:.2f}%")

accuracy : 96.77%


### 8.6 Varlık ismi Tanıma ( Named Entity Recognition ( NER ) ) Part 1 

![ScreenS/8.6_NER.PNG](ScreenS/8.6_NER.PNG)

#### 8.7 NER Part 2 

In [None]:
"""
warlik ismi tanima  mein ( cumle ) -> metin icersinde bulunan vrlik isimlerini tanımla 
"""

%pip install spacy -q

import pandas as pd 
import spacy 

# spacy modeli ile varlik ismi tanimla 
!python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm") # spacy kutuphanesi ingilizce dil modeli 

content = "Alice works at Amazon and lives in London. She visited the British Museum last weekend."

doc = nlp(content) # bu islem metindeki varlıklari ( entities ) analiz eder 

for ent in doc.ents: 
    # ent.text  varlik ismi
    # ent.start_char ve ent.end_char : varligi metindekki baslangic ve bitis karakterler 
    # print(ent.text, ent.start_char, ent.end_char, ent.label_)
    print(ent.text,ent.label_)

# ent.lemma_ : varligin kok hali 
entites = [(ent.text, ent.label_, ent.lemma_) for ent in doc.ents]

# varlik listesindi pandas df e cevir 
df = pd.DataFrame(entites, columns=["text", "type", "lemma"])

Note: you may need to restart the kernel to use updated packages.
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
     ---------------------------------------- 0.0/12.8 MB ? eta -:--:--
     -- ------------------------------------- 0.8/12.8 MB 6.7 MB/s eta 0:00:02
     ---------------------- ----------------- 7.3/12.8 MB 23.8 MB/s eta 0:00:01
     --------------------------------------- 12.8/12.8 MB 27.7 MB/s eta 0:00:00
Installing collected packages: en-core-web-sm
Successfully installed en-core-web-sm-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
Alice PERSON
Amazon ORG
London GPE
the British Museum ORG
last weekend DATE


### 8.8 Morfolojik Analiz ( Morphological Analysis ) Part 1

In [2]:
# Örn : "kitaplar " kelimesinin kökünü " kitap " ve ekinin " -lar "
#   oldugunu belirleyerek kelimenin cogul oldugunu tespit etmek. 

# kullanim alanları ; 
# Dil ögrenme araçlari
# Doğal dil işleme
# Otomatik çeviri

#### 8.9 Morphological Part 2 

In [1]:
import spacy

nlp = spacy.load("en_core_web_sm")

# incelenecek olan kelime yada kelimeler

word = "I go to schools"

# kelimeyi nlp isleminden gecir
doc = nlp(word)

for token in doc:
    
    print(f"Text: {token.text}")            # kelimenin kendisi
    print(f"Lemma: {token.lemma_}")         # kelimenin kok hali
    print(f"POS: {token.pos_}")             # kelimenin dilbilgisel ozelligi
    print(f"Tag: {token.tag_}")             # kelimenin detayli dilbilgisel ozelligi
    print(f"Dependency: {token.dep_}")      # kelimenin rolu 
    print(f"Shape: {token.shape_}")         # karakter yapisi
    print(f"Is alpha: {token.is_alpha}")    # kelimenin yalnizca alfabetik karakterlerden olusup olusmadigini kontrol eder
    print(f"Is stop: {token.is_stop}")      # kelimenin stop words olup olmadigi
    print(f"Morfoloji: {token.morph}")      # kelimenin morfolojik ozelliklerini verir 
    print(f"Is plural: {'Number=Plur' in token.morph}") # kelimenin cogul olup olmadigi
    print()

Text: I
Lemma: I
POS: PRON
Tag: PRP
Dependency: nsubj
Shape: X
Is alpha: True
Is stop: True
Morfoloji: Case=Nom|Number=Sing|Person=1|PronType=Prs
Is plural: False

Text: go
Lemma: go
POS: VERB
Tag: VBP
Dependency: ROOT
Shape: xx
Is alpha: True
Is stop: True
Morfoloji: Tense=Pres|VerbForm=Fin
Is plural: False

Text: to
Lemma: to
POS: ADP
Tag: IN
Dependency: prep
Shape: xx
Is alpha: True
Is stop: True
Morfoloji: 
Is plural: False

Text: schools
Lemma: school
POS: NOUN
Tag: NNS
Dependency: pobj
Shape: xxxx
Is alpha: True
Is stop: False
Morfoloji: Number=Plur
Is plural: True



### 8.10 Metin Parçası Etiketleme ( Part of Speech ( POS )) Part 1

![ScreenS/8.10_POS.PNG](ScreenS/8.10_POS.PNG)

#### POS Part 2

In [2]:
import spacy

nlp = spacy.load("en_core_web_sm")

sentence1 = "What is the weather like today or tomorrow"
doc1 = nlp(sentence1)

for token in doc1:
    print(token.text, token.pos_)

What PRON
is AUX
the DET
weather NOUN
like ADP
today NOUN
or CCONJ
tomorrow NOUN


### Kelime Anlamı Belirsizlği Giderme ( Word sense Disambiguation ) Part 1 

In [3]:
# WSD, bir kelimenin farklı anlamları arasında dogru olanı baglama göre seçme işlemidir 
# kullanım alanlari ; 
#  makine çevirisi 
#  arama motorlari 
#  dogal dil işlmee 

#### WSD Part 2 

In [1]:
import nltk
from nltk.wsd import lesk

# gerekli nltk paketlerini indir
nltk.download("wordnet")
nltk.download("own-1.4")
nltk.download("punkt")

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Error loading own-1.4: Package 'own-1.4' not found in
[nltk_data]     index
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
# ilk cumle
s1 = " I go to the bank to deposit money"
w1 = "bank"

sense1 = lesk(nltk.word_tokenize(s1), w1)
print(f"Cumle: {s1}")
print(f"Word: {w1}")
print(f"Sense: {sense1.definition()}")

Cumle:  I go to the bank to deposit money
Word: bank
Sense: a container (usually with a slot in the top) for keeping money at home


In [3]:
s2 = "The river bank is flooded after the heavy rain"
w2 = "bank"
sense2 = lesk(nltk.word_tokenize(s2), w2)

print(f"Cumle: {s2}")
print(f"Word: {w2}")
print(f"Sense: {sense2.definition()}")

Cumle: The river bank is flooded after the heavy rain
Word: bank
Sense: a slope in the turn of a road or track; the outside is higher than the inside in order to reduce the effects of centrifugal force


#### WSD Part 3

In [None]:
from pywsd.lesk import simple_lesk, adapted_lesk, cosine_lesk

# ornek cumle
sentences = [
    "I go to the bank to deposit money",
    "The river bank was flooded after the heavy rain"]

word = "bank"

for s in sentences:
    
    print(f"Sentence: {s}")
    
    sense_simple_lesk = simple_lesk(s, word)
    print(f"Sense simple: {sense_simple_lesk.definition()}")
    
    sense_adapted_lesk = adapted_lesk(s, word)
    print(f"Sense adapted: {sense_adapted_lesk.definition()}")
    
    sense_cosine_lesk = cosine_lesk(s, word)
    print(f"Sense cosine: {sense_cosine_lesk.definition()}")
    
"""
Sentence: I go to the bank to deposit money (banka)
Sense simple: a financial institution that accepts deposits and channels the money into lending activities
Sense adapted: a financial institution that accepts deposits and channels the money into lending activities
Sense cosine: a container (usually with a slot in the top) for keeping money at home

Sentence: The river bank was flooded after the heavy rain
Sense simple: sloping land (especially the slope beside a body of water)
Sense adapted: sloping land (especially the slope beside a body of water)
Sense cosine: a supply or stock held in reserve for future use (especially in emergencies)
"""

### Duygu Analizi ( Sentiment Analysis ) Part 1 

#### SA Part 2 

In [3]:
"""
binanry classification problemi 
"""
# import libraries 
import pandas as pd 
import nltk 

from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download("vader_lexicon")
nltk.download("stopwords")
nltk.download("punkt")
nltk.download("wordnet")
nltk.download("omw-1.4")

df = pd.read_csv('datasets/duygu_analizi_amazon_veri_seti.csv')

# text cleaning ve presprocessing 
lemmatizer = WordNetLemmatizer()

def clean_preprocess_data(text):

    # tokenize 
    tokens = word_tokenize(text.lower())

    # stop words
    filtered_tokens = [token for token in tokens if token not in stopwords.words("english")]

    # lemmatize
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # join words
    processed_text = " ".join(lemmatized_tokens)
    
    return processed_text
df["reviewText2"] = df["reviewText"].apply(clean_preprocess_data)

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mfurk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


#### SA Part 3 

In [None]:
# sentiment analysis (nltk)
analyzer = SentimentIntensityAnalyzer()

def get_sentiments(text):
    
    score = analyzer.polarity_scores(text)
    
    sentiment = 1 if score["pos"] > 0 else 0
    
    return sentiment

df["sentiment"] = df["reviewText2"].apply(get_sentiments)

# evaluation - test
from sklearn.metrics import confusion_matrix, classification_report

cm = confusion_matrix(df["Positive"], df["sentiment"])

print (cm)

cr = classification_report(df["Positive"], df["sentiment"])

print(f"Classification report: \n{cr}")


[[ 1131  3636]
 [  576 14657]]
Classification report: 
              precision    recall  f1-score   support

           0       0.66      0.24      0.35      4767
           1       0.80      0.96      0.87     15233

    accuracy                           0.79     20000
   macro avg       0.73      0.60      0.61     20000
weighted avg       0.77      0.79      0.75     20000

