In [1]:
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline, make_pipeline
from catboost import CatBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RepeatedStratifiedKFold

# Easy

In [2]:
df = pd.read_csv('../data/singapore_airlines_reviews.csv')
df.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [3]:
df = df.drop(columns=['type', 'published_date'])
df['published_platform'] = (df['published_platform'] == 'Desktop')
df['rating'] = (df['rating'] > 3) # будем считать негативными отзывами те, у которых рейтинг <= 3
df = df.dropna()

In [4]:
def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    return text

In [5]:
df.head()

Unnamed: 0,published_platform,rating,text,title,helpful_votes
0,True,False,We used this airline to go from Singapore to L...,Ok,0
1,True,True,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,True,False,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,True,True,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,True,False,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [6]:
df['text_modified'] = df['title'] + ' ' + df['text']
df['text_modified'] = df['text_modified'].apply(preprocess)
df = df.dropna()
df.head()

Unnamed: 0,published_platform,rating,text,title,helpful_votes,text_modified
0,True,False,We used this airline to go from Singapore to L...,Ok,0,ok we used this airline to go from singapore t...
1,True,True,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0,the service in suites class makes one feel lik...
2,True,False,"Booked, paid and received email confirmation f...",Don’t give them your money,0,dont give them your money booked paid and rece...
3,True,True,"Best airline in the world, seats, food, servic...",Best Airline in the World,0,best airline in the world best airline in the ...
4,True,False,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0,premium economy seating on singapore airlines ...


In [7]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
bow = CountVectorizer()
train_x = bow.fit_transform(train['text_modified']).toarray()
test_x = bow.transform(test['text_modified']).toarray()
train_x.shape, test_x.shape

((7999, 21144), (2000, 21144))

In [8]:
train_y = train['rating']
test_y = test['rating']

In [9]:
model = LogisticRegression(random_state=42)
model.fit(train_x, train_y)
predicted = model.predict(test_x)
f1_score(test_y, predicted)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.943089430894309

# Medium

In [10]:
!pip install nltk



In [11]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.porter import *
 
nltk.download('stopwords', quiet=True)
nltk.download('punkt', quiet=True)

stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stem_delete_stopwords(text):
    text = re.sub(r'[^\w\s]', '', text).lower()
    text = ' '.join(map(stemmer.stem, text.split(' ')))
    word_tokens = word_tokenize(text)
    filtered_text = [w for w in word_tokens if not w in stop_words]
    return ' '.join(filtered_text)

In [12]:
df['text_modified'] = df['text_modified'].apply(stem_delete_stopwords)
df.drop(inplace=True, columns=['text', 'title'])
df.head()

Unnamed: 0,published_platform,rating,helpful_votes,text_modified
0,True,False,0,ok use thi airlin go singapor london heathrow ...
1,True,True,0,servic suit class make one feel like vip servi...
2,True,False,0,dont give money book paid receiv email confirm...
3,True,True,0,best airlin world best airlin world seat food ...
4,True,False,0,premium economi seat singapor airlin worth mon...


In [13]:
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.shape[0], test.shape[0]

(7999, 2000)

In [14]:
fit_results = []

x_train = train['text_modified']
x_test = test['text_modified']
y_train = train['rating']
y_test = test['rating']

for vectorizer in [CountVectorizer(), TfidfVectorizer()]:
    for model in [LogisticRegression(random_state=42), RandomForestClassifier(random_state=42), 
                  KNeighborsClassifier(), CatBoostClassifier(metric_period=100, random_state=42)]:
        pipeline = Pipeline(
            [
                ("vectorizer", vectorizer),
                ("model", model),
            ]
        )
        pipeline.fit(x_train, y_train)
        y_pred = pipeline.predict(x_test)
        metric = f1_score(y_pred, y_test)
        fit_results.append(
            {
                'vectorizer': vectorizer.__class__.__name__,
                'model': model.__class__.__name__,
                'f1': metric,
            }
        )

fit_results = pd.DataFrame(fit_results)

Learning rate set to 0.025034
0:	learn: 0.6777899	total: 78.1ms	remaining: 1m 18s
100:	learn: 0.3390364	total: 2.19s	remaining: 19.5s
200:	learn: 0.2848477	total: 4.29s	remaining: 17.1s
300:	learn: 0.2565835	total: 6.39s	remaining: 14.8s
400:	learn: 0.2359158	total: 8.46s	remaining: 12.6s
500:	learn: 0.2192603	total: 10.5s	remaining: 10.5s
600:	learn: 0.2070765	total: 12.6s	remaining: 8.39s
700:	learn: 0.1984068	total: 14.7s	remaining: 6.29s
800:	learn: 0.1895066	total: 16.8s	remaining: 4.17s
900:	learn: 0.1823852	total: 18.8s	remaining: 2.07s
999:	learn: 0.1757785	total: 20.9s	remaining: 0us
Learning rate set to 0.025034
0:	learn: 0.6760271	total: 44.6ms	remaining: 44.6s
100:	learn: 0.3299936	total: 3.97s	remaining: 35.4s
200:	learn: 0.2784480	total: 7.87s	remaining: 31.3s
300:	learn: 0.2506997	total: 11.9s	remaining: 27.5s
400:	learn: 0.2293231	total: 15.7s	remaining: 23.5s
500:	learn: 0.2103704	total: 19.9s	remaining: 19.8s
600:	learn: 0.1961157	total: 24.2s	remaining: 16.1s
700:	le

In [15]:
fit_results.sort_values('f1', ascending=False)

Unnamed: 0,vectorizer,model,f1
4,TfidfVectorizer,LogisticRegression,0.942237
3,CountVectorizer,CatBoostClassifier,0.941332
7,TfidfVectorizer,CatBoostClassifier,0.939526
0,CountVectorizer,LogisticRegression,0.937754
1,CountVectorizer,RandomForestClassifier,0.915601
5,TfidfVectorizer,RandomForestClassifier,0.913722
6,TfidfVectorizer,KNeighborsClassifier,0.895244
2,CountVectorizer,KNeighborsClassifier,0.854777


Tf-idf, вероятно, работает хорошо в силу того, что в текстах довольно много повторяющихся слов. Логистическая регрессия помогает выделить наиболее часто/редко встречающиеся, которые сильнее всего влияют на результат. CatBoost тоже хорошо умеет отделять значительные фичи от незначительных в силу своего сложного устройства.

Переберем несколько гиперпараметров лучшей модели.

In [16]:
X = df['text_modified']
y = df['rating']

solvers = ['newton-cg', 'lbfgs', 'liblinear']
penalty = ['l2']
c_values = [100, 10, 1.0, 0.1, 0.01]

grid = dict(solver=solvers,penalty=penalty,C=c_values)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)

clf = make_pipeline(TfidfVectorizer(), 
                    GridSearchCV(estimator=LogisticRegression(random_state=42),
                                 param_grid=grid,
                                 cv=cv,
                                 n_jobs=-1,
                                 scoring='f1',
                                 error_score=0))

grid_result = clf.fit(X, y)['gridsearchcv']
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))

Best: 0.942557 using {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.934872 (0.005965) with: {'C': 100, 'penalty': 'l2', 'solver': 'newton-cg'}
0.935040 (0.005247) with: {'C': 100, 'penalty': 'l2', 'solver': 'lbfgs'}
0.934872 (0.005957) with: {'C': 100, 'penalty': 'l2', 'solver': 'liblinear'}
0.942119 (0.006152) with: {'C': 10, 'penalty': 'l2', 'solver': 'newton-cg'}
0.942557 (0.005830) with: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
0.942142 (0.006013) with: {'C': 10, 'penalty': 'l2', 'solver': 'liblinear'}
0.941242 (0.005341) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'newton-cg'}
0.941282 (0.005739) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
0.941330 (0.005374) with: {'C': 1.0, 'penalty': 'l2', 'solver': 'liblinear'}
0.896863 (0.005550) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'newton-cg'}
0.898000 (0.005797) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'lbfgs'}
0.898497 (0.005933) with: {'C': 0.1, 'penalty': 'l2', 'solver': 'liblinear'}
0.850029 (0.000233) with: {'

Лучше всех оказалась модель с дефолтными параметрами

In [17]:
def custom_test(to_score, clf):
    to_score = map(stem_delete_stopwords, to_score)
    return clf.predict(to_score)
    
model = grid_result.best_estimator_
clf = Pipeline(
            [
                ("vectorizer", TfidfVectorizer()),
                ("model", model),
            ]
        )
clf.fit(X, y)

to_score = ['The trip was amazing thanks to wonderful service of this aircompany', # positive
            'It was awful. The chairs where uncomfortable. The food was cold and completely uneatable, although I paid 1000$ for premium ticket.'] # negative

custom_test(to_score, clf)

array([ True, False])

Все верно

# Hard

Посмотрим на коэффициенты модели, которая она присваивала словам, и выберем самые "позитивные" и "негативные"

In [18]:
K = 20
coefs = clf[1].coef_.tolist()[0]
res_min = sorted(range(len(coefs)), key=lambda sub: coefs[sub])[:K]
res_max = sorted(range(len(coefs)), key=lambda sub: coefs[sub])[len(coefs)-K:]

In [19]:
vocab = clf[0].vocabulary_ # слову сопоставляется индекс
vocab = {y: x for x, y in vocab.items()} # меняем key и value местами, чтобы по индексу получать слово
for id in res_min:
    print(vocab[id])
print()
for id in res_max:
    print(vocab[id])

terribl
worst
disappoint
poor
uncomfort
averag
bad
ago
usualservic
limit
dont
disapoint
yet
old
sorri
horribl
basic
said
milk
aw

enjoy
easi
outstand
reliabl
perfect
wonder
profession
good
thank
love
pleasant
fantast
plenti
courteou
alway
comfort
amaz
great
best
excel


Идея: в позитивных отзывах попытаться использовать максимальное количество слов, которые наиболее сильно влияют на то, что текст будет отнесен к негативному классу и не использовать наиболее "позитивные" слова, и наоборот.

In [20]:
# negative
neg_reviews = [
    "Despite Singapore Airlines' reputation for reliability, my recent flight was marred by a noticeable lack of professionalism from the staff.",
    "Despite the excellence Singapore Airlines is known for, my recent experience fell short of the expected courtesy from staff, leaving much to be desired.",
    "The promise of a pleasant journey with Singapore Airlines was overshadowed by frequent delays and discomfort throughout the flight.",
    "In spite of Singapore Airlines' esteemed reputation, my recent journey was marred by unforeseen delays and a noticeable absence of professionalism from the crew, leaving much to be desired.",
    "While Singapore Airlines strives for greatness, my recent flight raised doubts about their reliability and overall quality of service."
]

custom_test(neg_reviews, clf)

array([ True,  True,  True,  True,  True])

In [21]:
# positive
pos_reviews = [
    "Flying with Singapore Airlines was a breath of fresh air compared to my previous terrible experiences with other airlines, the service was top-notch, leaving me pleasantly surprised.",
    "Despite my past disappointments with air travel, Singapore Airlines surpassed all expectations with their service and attention to detail.",
    "After enduring numerous uncomfortable flights in the past, I was hesitant to book with Singapore Airlines, however, their exceptional service and attention to detail made this journey far from average.",
    "Despite my worries, my flight with Singapore Airlines was anything but terrible; in fact, it was quite the opposite. The exceptional service surpassed my expectations and left me pleasantly surprised.",
    "After years of disappointing flights, I was surprised by Singapore Airlines' exceptional service."
]


custom_test(pos_reviews, clf)

array([False, False, False, False, False])