# Easy

In [70]:
import pandas as pd

In [71]:
df = pd.read_csv('../data/singapore_airlines_reviews.csv')
df.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


Будем считать, что плохие отзывы имеют рейтинг 1,2,3
А хорошие отзывы имеют рейтинг 4,5

In [72]:
df['rating_1_or_0'] = df['rating'].apply(lambda x: int(x > 3))
df['report'] = df['title'] + '. ' +  df['text']
columns_to_drop = ['text', 'title', 'published_date', 'type', 'published_platform', 'rating', 'helpful_votes']
df.drop(columns_to_drop, axis = 1, inplace = True)
df.head()

Unnamed: 0,rating_1_or_0,report
0,0,Ok. We used this airline to go from Singapore ...
1,1,The service in Suites Class makes one feel lik...
2,0,"Don’t give them your money. Booked, paid and r..."
3,1,Best Airline in the World. Best airline in the...
4,0,Premium Economy Seating on Singapore Airlines ...


In [73]:
import re

In [74]:
def preprocess_text(text):
    return re.sub(r"[^\w\s]+", '', text).lower()

In [75]:
df = df.dropna()
df.head()

Unnamed: 0,rating_1_or_0,report
0,0,Ok. We used this airline to go from Singapore ...
1,1,The service in Suites Class makes one feel lik...
2,0,"Don’t give them your money. Booked, paid and r..."
3,1,Best Airline in the World. Best airline in the...
4,0,Premium Economy Seating on Singapore Airlines ...


In [76]:
df['report'] = df['report'].apply(preprocess_text)

In [77]:
df.head()

Unnamed: 0,rating_1_or_0,report
0,0,ok we used this airline to go from singapore t...
1,1,the service in suites class makes one feel lik...
2,0,dont give them your money booked paid and rece...
3,1,best airline in the world best airline in the ...
4,0,premium economy seating on singapore airlines ...


In [78]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df)

In [79]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer()

In [80]:
x_train = tfidf.fit_transform(train['report']).toarray()
x_test = tfidf.transform(test['report']).toarray()

In [81]:
y_train = train['rating_1_or_0']
y_test = test['rating_1_or_0']

In [82]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(max_iter=500)
model.fit(x_train, y_train)

In [83]:
from sklearn.metrics import f1_score

y_pred = model.predict(x_test)
f1_score(y_test, y_pred)

0.9413948554760011

In [84]:
from sklearn.ensemble import RandomForestClassifier

model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(x_train, y_train)

In [85]:
y_pred = model.predict(x_test)
f1_score(y_test, y_pred)

0.9010221889803043

Видим, что у LogisticRegression результат чуть лучше, чем у RandomForestClassifier, по времени они обучались примерно одинаково и довольно быстро

# Medium

In [86]:
from nltk import PorterStemmer

In [87]:
from nltk.corpus import stopwords

In [88]:
import nltk

nltk.download('punkt')
nltk.download('stopwords')
stop_words = stopwords.words("english")
stemmer = PorterStemmer()

[nltk_data] Downloading package punkt to /Users/elizabeth/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/elizabeth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [89]:
def preprocess(text):
    tokens = nltk.word_tokenize(text.lower())
    tokens = [token for token in tokens if token.isalpha() and token not in stop_words]
    tokens = [stemmer.stem(token) for token in tokens]
    return " ".join(tokens)

In [90]:
df['stemmer'] = df['report'].apply(preprocess)

In [91]:
df.head()

Unnamed: 0,rating_1_or_0,report,stemmer
0,0,ok we used this airline to go from singapore t...,ok use airlin go singapor london heathrow issu...
1,1,the service in suites class makes one feel lik...,servic suit class make one feel like vip servi...
2,0,dont give them your money booked paid and rece...,dont give money book paid receiv email confirm...
3,1,best airline in the world best airline in the ...,best airlin world best airlin world seat food ...
4,0,premium economy seating on singapore airlines ...,premium economi seat singapor airlin worth mon...


In [92]:
train, test = train_test_split(df)

In [93]:
tfidf = TfidfVectorizer()

In [95]:
x_train = tfidf.fit_transform(train['stemmer']).toarray()
x_test = tfidf.transform(test['stemmer']).toarray()

In [96]:
y_train = train['rating_1_or_0']
y_test = test['rating_1_or_0']

In [98]:
lr = LogisticRegression(max_iter=500)
lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)
print("LogisticRegression", f1_score(y_test, y_pred))

LogisticRegression 0.9435951502372167


In [99]:
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
rfc.fit(x_train, y_train)
y_pred = rfc.predict(x_test)
print("RandomForestClassifier", f1_score(y_test, y_pred))

RandomForestClassifier 0.9168573607932876


In [100]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)
print("kNN", f1_score(y_test, y_pred))

kNN 0.9005847953216375


In [101]:
good = "Thank you very much to the airline for a wonderful flight! I felt great throughout the flight, the staff was very responsive and was always ready to provide their services."
bad = "Terrible flight. The staff were rude to passengers, I was moved to a seat 10 rows away from my family, because the staff were inattentive in seating passengers. The food was disgusting. I will never fly with this airline again."

In [105]:
good_stemming = preprocess(good)
bad_stemming = preprocess(bad)
print(good_stemming)
print(bad_stemming)

thank much airlin wonder flight felt great throughout flight staff respons alway readi provid servic
terribl flight staff rude passeng move seat row away famili staff inattent seat passeng food disgust never fli airlin


In [106]:
good_stemming_v = tfidf.transform([good_stemming]).toarray()
bad_stemming_v = tfidf.transform([bad_stemming]).toarray()

In [111]:
print("LogisticRegression good", lr.predict(good_stemming_v))
print("LogisticRegression bad", lr.predict(bad_stemming_v))

LogisticRegression good [1]
LogisticRegression bad [0]


In [112]:
print("RandomForestClassifier good", rfc.predict(good_stemming_v))
print("RandomForestClassifier bad", rfc.predict(bad_stemming_v))

RandomForestClassifier good [1]
RandomForestClassifier bad [0]


In [113]:
print("kNN good", knn.predict(good_stemming_v))
print("kNN bad", knn.predict(bad_stemming_v))

kNN good [1]
kNN bad [0]


Видно, что все модели справились с распознаванием, как хорошего отзыва, так и плохого