# Дз 6
## Easy

In [105]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import f1_score
import warnings

warnings.filterwarnings(action='ignore')

In [106]:
df = pd.read_csv('../data/singapore_airlines_reviews.csv')
df.head()

Unnamed: 0,published_date,published_platform,rating,type,text,title,helpful_votes
0,2024-03-12T14:41:14-04:00,Desktop,3,review,We used this airline to go from Singapore to L...,Ok,0
1,2024-03-11T19:39:13-04:00,Desktop,5,review,The service on Singapore Airlines Suites Class...,The service in Suites Class makes one feel lik...,0
2,2024-03-11T12:20:23-04:00,Desktop,1,review,"Booked, paid and received email confirmation f...",Don’t give them your money,0
3,2024-03-11T07:12:27-04:00,Desktop,5,review,"Best airline in the world, seats, food, servic...",Best Airline in the World,0
4,2024-03-10T05:34:18-04:00,Desktop,2,review,Premium Economy Seating on Singapore Airlines ...,Premium Economy Seating on Singapore Airlines ...,0


In [107]:
df.drop(columns=['type'], inplace=True)


def clean_text(text):
    text = str(text)
    text = re.sub(r'[().,?!-:]', '', text)
    return text

In [108]:
df['title'] = df['title'].apply(clean_text)
df['text'] = df['text'].apply(clean_text)
df['all text'] = df['title'] + ' ' + df['text']
# Сделаем рейтинг бинарным: 1 - это оценки 3, 4, 5; 0 - это 1, 2
df['rating_bin'] = df['rating'].apply(lambda x: int(x > 3))
new_df = df[['all text', 'rating_bin']]
new_df.head()

Unnamed: 0,all text,rating_bin
0,Ok We used this airline to go from Singapore t...,0
1,The service in Suites Class makes one feel lik...,1
2,Don’t give them your money Booked paid and rec...,0
3,Best Airline in the World Best airline in the ...,1
4,Premium Economy Seating on Singapore Airlines ...,0


In [109]:
train_reviews, test_reviews = train_test_split(df, random_state=42)
evaluation_results = []
text_features = ['all text', 'title', 'text']
model = LogisticRegression()
vectorizer_list = [CountVectorizer(), TfidfVectorizer()]
for vectorizer in vectorizer_list:
    for feature in text_features:
        X_train = vectorizer.fit_transform(train_reviews[feature])
        X_test = vectorizer.transform(test_reviews[feature])
        y_train = train_reviews['rating_bin']
        y_test = test_reviews['rating_bin']

        model_instance = model
        model_instance.fit(X_train, y_train)
        predictions = model_instance.predict(X_test)

        f1 = f1_score(predictions, y_test, average="weighted")
        evaluation_results.append((feature, model.__class__.__name__, vectorizer.__class__.__name__, f1))

results_df = pd.DataFrame(evaluation_results, columns=['Feature', 'Model', 'Vectorizer', 'F1 Score'])
results_df = results_df.sort_values(by='F1 Score', ascending=False)
results_df

Unnamed: 0,Feature,Model,Vectorizer,F1 Score
3,all text,LogisticRegression,TfidfVectorizer,0.913585
0,all text,LogisticRegression,CountVectorizer,0.907692
5,text,LogisticRegression,TfidfVectorizer,0.901082
2,text,LogisticRegression,CountVectorizer,0.898028
1,title,LogisticRegression,CountVectorizer,0.868887
4,title,LogisticRegression,TfidfVectorizer,0.868467


#### Лучший результат

In [110]:
best_result = results_df.iloc[0]
best_result

Feature                 all text
Model         LogisticRegression
Vectorizer       TfidfVectorizer
F1 Score                0.913585
Name: 3, dtype: object

## Medium

In [111]:
import spacy
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
from nltk import PorterStemmer
from nltk.corpus import stopwords
import nltk

In [112]:
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lizag\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


#### Удаление стоп-слов, стемминг, лемматизация

In [113]:
def preprocess_text(text):
    words = [word for word in text.split() if word.lower() not in stop_words]
    stem_words = [stemmer.stem(word) for word in words]
    doc = nlp(" ".join(stem_words))
    lem_words = [token.lemma_ for token in doc]
    return " ".join(lem_words)


df['processed_text'] = df['text'].apply(preprocess_text)
df['processed_text'].head()

0    use airlin go singapor london heathrow issu ti...
1    servic singapor airlin suit class noth excel c...
2    book pay receiv email confirm extra legroom se...
3    good airlin world seat food servic brilliant c...
4    premium economi seat singapor airlin narrow se...
Name: processed_text, dtype: object

In [114]:
train_reviews, test_reviews = train_test_split(df, random_state=42)

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_reviews['processed_text'])
X_test = vectorizer.transform(test_reviews['processed_text'])
y_train = train_reviews['rating']
y_test = test_reviews['rating']
evaluation_results = []

#### Модели и параметры

In [115]:
models = {
    'LogisticRegression': LogisticRegression(),
    'SVC': SVC(),
    'RandomForestClassifier': RandomForestClassifier()
}

params = {
    'LogisticRegression': {'C': [1, 3, 10]},
    'SVC': {'C': [1, 3, 10], 'kernel': ['linear', 'poly', 'rbf']},
    'RandomForestClassifier': {'n_estimators': [5, 50, 100]}
}

In [116]:
for model_name in models:
    model = models[model_name]
    param_grid = params[model_name]

    grid_search = GridSearchCV(model, param_grid, cv=3, scoring='f1_weighted')
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    predictions = best_model.predict(X_test)

    f1 = f1_score(predictions, y_test, average="weighted")
    evaluation_results.append((model_name, best_model.get_params(), f1))

res_df = pd.DataFrame(evaluation_results, columns=['Model', 'Best parameters', 'F1 Score'])
res_df = res_df.sort_values(by='F1 Score', ascending=False)
res_df

Unnamed: 0,Model,Best parameters,F1 Score
1,SVC,"{'C': 1, 'break_ties': False, 'cache_size': 20...",0.701027
0,LogisticRegression,"{'C': 3, 'class_weight': None, 'dual': False, ...",0.67026
2,RandomForestClassifier,"{'bootstrap': True, 'ccp_alpha': 0.0, 'class_w...",0.582248


#### Лучший результат

In [117]:
best_res = res_df.iloc[0]
best_res

Model                                                            SVC
Best parameters    {'C': 1, 'break_ties': False, 'cache_size': 20...
F1 Score                                                    0.701027
Name: 1, dtype: object

#### Тестирование модели

In [125]:
best_model_name = best_res['Model']
best_model_params = best_res['Best parameters']

best_model = models[best_model_name]
best_model.set_params(**best_model_params)
best_model.fit(X_train, y_train)

new_reviews = [
    "Well-placed signs at the airport, delicious drinks on board, friendly staff",
    "Disgusting food and terribly long wait for an stewardess"
]

new_reviews_processed = [preprocess_text(review) for review in new_reviews]
new_reviews_vectorized = vectorizer.transform(new_reviews_processed)
predictions_new_reviews = best_model.predict(new_reviews_vectorized)

for review, rating in zip(new_reviews, predictions_new_reviews):
    print(f"Отзыв: {review}. Предсказанный рейтинг: {rating}")

Отзыв: Well-placed signs at the airport, delicious drinks on board, friendly staff. Предсказанный рейтинг: 5
Отзыв: Disgusting food and terribly long wait for an stewardess. Предсказанный рейтинг: 1
