In [7]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report
import pandas as pd 

def logistic_regression(df, test_size=0.2, random_state=42, max_features=5000,
                        ngram_range=(1, 2), max_iter=1000):
    """Train a TF-IDF + logistic-regression classifier and print its report.

    The frame is split into stratified train/test parts, the text is
    vectorized with TF-IDF (fitted on the training part only), a
    LogisticRegression model is fitted, and a classification report for the
    held-out part is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide a "text" column (model input) and a "label" column
        (target, also used for stratifying the split).
    test_size : float, default 0.2
        Passed to ``train_test_split`` as the held-out share.
    random_state : int, default 42
        Seed passed to ``train_test_split`` so the split is repeatable.
    max_features : int, default 5000
        Vocabulary cap passed to ``TfidfVectorizer``.
    ngram_range : tuple of int, default (1, 2)
        N-gram range passed to ``TfidfVectorizer``.
    max_iter : int, default 1000
        Iteration cap passed to ``LogisticRegression``.

    Returns
    -------
    None
        The report is printed; nothing is returned, so calling this as the
        last line of a cell does not add an extra ``Out[...]`` value.
    """
    # 1. Split the data, keeping the label proportions equal in both parts.
    X_train, X_test, y_train, y_test = train_test_split(
        df["text"],
        df["label"],
        test_size=test_size,
        random_state=random_state,
        stratify=df["label"],
    )

    # 2. Convert text to numerical features using TF-IDF.
    #    The vectorizer is fitted on the training texts only and then reused
    #    on the test texts, so no test vocabulary leaks into the features.
    vectorizer = TfidfVectorizer(max_features=max_features, ngram_range=ngram_range)
    X_train_tfidf = vectorizer.fit_transform(X_train)
    X_test_tfidf = vectorizer.transform(X_test)

    # 3. Train logistic regression.
    model = LogisticRegression(max_iter=max_iter)
    model.fit(X_train_tfidf, y_train)

    # 4. Predict and evaluate on the held-out part.
    y_pred = model.predict(X_test_tfidf)
    print(classification_report(y_test, y_pred))



#### Logistic Regression - scikit-learn

The same model is trained and evaluated on each of the following datasets:

- starbucks.csv
- starbucks2.csv
- starbucks.csv + starbucks2.csv
- dunkin.csv
- starbucks.csv + starbucks2.csv + dunkin.csv

In [8]:
# Load starbucks.csv and clean it as one chain instead of mutating in place.
df1 = (
    pd.read_csv('starbucks.csv')
    # Keep one row per distinct text; the last occurrence wins.
    .drop_duplicates(subset=['text'], keep='last')
    # Rows missing either the text or the label cannot be used for training.
    .dropna(subset=['text', 'label'])
    # A bare `reindex()` returns a new object and changes nothing here;
    # reset_index gives the cleaned frame a contiguous 0..n-1 index.
    .reset_index(drop=True)
)
logistic_regression(df1)

              precision    recall  f1-score   support

    negative       0.86      0.33      0.48        18
     neutral       0.65      0.95      0.77        42
    positive       0.83      0.56      0.67        27

    accuracy                           0.70        87
   macro avg       0.78      0.61      0.64        87
weighted avg       0.75      0.70      0.68        87



In [9]:
# Load starbucks2.csv and clean it as one chain instead of mutating in place.
df2 = (
    pd.read_csv('starbucks2.csv')
    # Keep one row per distinct text; the last occurrence wins.
    .drop_duplicates(subset=['text'], keep='last')
    # Rows missing either the text or the label cannot be used for training.
    .dropna(subset=['text', 'label'])
    # A bare `reindex()` returns a new object and changes nothing here;
    # reset_index gives the cleaned frame a contiguous 0..n-1 index.
    .reset_index(drop=True)
)
logistic_regression(df2)

              precision    recall  f1-score   support

    negative       0.62      0.58      0.60        36
     neutral       0.67      0.32      0.43        25
    positive       0.56      0.78      0.65        40

    accuracy                           0.59       101
   macro avg       0.62      0.56      0.56       101
weighted avg       0.61      0.59      0.58       101



In [10]:
# Combine both Starbucks frames. Each frame was deduplicated on its own, so a
# text present in both files would still appear twice after concatenation and
# could end up in both the train and the test split. Apply the same
# keep-last rule across the combined frame before evaluating.
starbucks = (
    pd.concat([df1, df2], ignore_index=True)
    .drop_duplicates(subset=['text'], keep='last')
    .reset_index(drop=True)
)
logistic_regression(starbucks)

              precision    recall  f1-score   support

    negative       0.89      0.57      0.70        54
     neutral       0.49      0.52      0.51        67
    positive       0.55      0.67      0.60        67

    accuracy                           0.59       188
   macro avg       0.64      0.59      0.60       188
weighted avg       0.63      0.59      0.60       188



In [11]:
# Load dunkin.csv and clean it as one chain instead of mutating in place.
dunkin = (
    pd.read_csv('dunkin.csv')
    # Keep one row per distinct text; the last occurrence wins.
    .drop_duplicates(subset=['text'], keep='last')
    # Rows missing either the text or the label cannot be used for training.
    .dropna(subset=['text', 'label'])
    # A bare `reindex()` returns a new object and changes nothing here;
    # reset_index gives the cleaned frame a contiguous 0..n-1 index.
    .reset_index(drop=True)
)
logistic_regression(dunkin)

              precision    recall  f1-score   support

    negative       0.33      0.05      0.08        22
     neutral       0.59      0.38      0.46        42
    positive       0.47      0.84      0.61        44

    accuracy                           0.50       108
   macro avg       0.47      0.42      0.38       108
weighted avg       0.49      0.50      0.44       108



In [12]:
# Combine all three frames. Deduplication so far was per file only, so repeat
# the keep-last rule on the combined frame; otherwise an identical text coming
# from two files could sit in both the train and the test split.
df = (
    pd.concat([df1, df2, dunkin], ignore_index=True)
    .drop_duplicates(subset=['text'], keep='last')
    .reset_index(drop=True)
)
logistic_regression(df)

              precision    recall  f1-score   support

    negative       0.65      0.32      0.42        76
     neutral       0.54      0.61      0.58       109
    positive       0.56      0.68      0.62       111

    accuracy                           0.56       296
   macro avg       0.58      0.54      0.54       296
weighted avg       0.58      0.56      0.55       296

