In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.base import clone
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
from lightgbm import LGBMClassifier

# Read the CSV file into a DataFrame
tweet_data_raw = pd.read_csv('train_nick.csv')

# Drop rows whose tweet is missing BEFORE casting to str.
# Casting first would turn NaN into the literal text 'nan', and a later
# isinstance(x, str) filter could never remove it because every value is
# already a str at that point. Those rows would then be trained on as if
# 'nan' were a real tweet.
# The original index is kept (no reset_index) so row labels still match the CSV.
tweet_data_int = (
    tweet_data_raw
    .dropna(subset=['tweet'])
    .assign(tweet=lambda frame: frame['tweet'].astype(str))
)

# NOTE: the former ''.join(x) step was removed. Each value is already a single
# str, and joining a str's characters with '' returns the same str.

# Split the data into features (X) and target (y)
X = tweet_data_int['tweet']
y = tweet_data_int['sentiment']

# Split the data into training, validation, and testing sets.
# First hold out 20% for testing; then 25% of the remaining 80% (= 20% of the
# total) becomes the validation set, giving a 60/20/20 split.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.25, random_state=42)

print(X_train.head())
print(y_train.head())

11888    one thing I m amazed at be how close defender ...
22430                                              aiiight
49665    number 1 and only number 1 if you consider any...
6277       a powerful and insightful portrait of a uniq...
26297                follow my favorite youtuber jorraptor
Name: tweet, dtype: object
11888    2
22430    2
49665    3
6277     2
26297    0
Name: sentiment, dtype: int64


In [None]:
# Text -> TF-IDF features -> LightGBM classifier, chained as a single estimator
# so the vectorizer is re-fit inside every cross-validation fold.
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('lgbm', LGBMClassifier()),
])

# Candidate hyperparameters, keyed as <step name>__<parameter name>.
tfidf_grid = {'tfidf__max_features': [1000, 2000]}
lgbm_grid = {
    'lgbm__num_leaves': [20, 30],
    'lgbm__learning_rate': [0.05, 0.1],
    'lgbm__n_estimators': [100, 200],
}
param_grid = {**tfidf_grid, **lgbm_grid}

# Exhaustive search: 2 x 2 x 2 x 2 = 16 combinations, each scored by
# accuracy with 5-fold cross-validation on the training split only.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
grid_search.fit(X_train, y_train)

In [None]:
# Get the best model and its parameters
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

# Fit the best model on the training data.
# GridSearchCV (refit=True by default) has already done this once; fitting
# again keeps this cell's numbers correct on a re-run even if the same
# estimator object was refit on other data in the meantime.
best_model.fit(X_train, y_train)

# Get predictions on the test set
y_pred = best_model.predict(X_test)

# Calculate accuracy for the best model
accuracy = accuracy_score(y_test, y_pred)

# Compare accuracy before and after tuning.
# The "initial" model must be a different estimator from best_model,
# otherwise both accuracies are identical by construction. Here it is an
# unfitted copy of the pipeline originally handed to the grid search (i.e.
# with its untuned hyperparameters), trained on the same training split.
initial_model = clone(grid_search.estimator)
initial_model.fit(X_train, y_train)
y_pred_initial = initial_model.predict(X_test)
accuracy_initial = accuracy_score(y_test, y_pred_initial)

# Print accuracy and best parameters
print("Accuracy (Best Model):", accuracy)
print("Best Parameters:", best_params)
print("Accuracy (Initial Model):", accuracy_initial)

# Print classification report
print(classification_report(y_test, y_pred))

# Quantitative Results
# Note: np.mean(y_test == y_pred) is the fraction of correct predictions, so
# the 'Mean Accuracy' row is expected to equal the 'Accuracy' row.
results = pd.DataFrame({'Metric': ['Accuracy', 'Mean Accuracy'],
                        'Value': [accuracy, np.mean(y_test == y_pred)]})
print("Quantitative Results:")
print(results)

# Learning Curves
# Train copies of the tuned pipeline on growing prefixes of the training split
# and score each copy on its own training prefix and on the validation set.
train_scores = []
val_scores = []

# Ten sizes from 10% to 100% of the training split. This avoids the
# degenerate 1-sample fit, keeps the number of refits small, and makes sure
# the last point uses the full training set. np.unique removes duplicate
# sizes that can appear when the training split is very small.
n_train = len(X_train)
param_range = np.unique(np.ceil(np.linspace(0.1, 1.0, 10) * n_train).astype(int))

for size in param_range:
    # .iloc makes the "first `size` rows" slice explicitly positional,
    # independent of the (shuffled, integer) index labels.
    X_train_subset = X_train.iloc[:size]
    y_train_subset = y_train.iloc[:size]

    # Fit an unfitted clone so best_model itself keeps its full-training fit.
    curve_model = clone(best_model)
    curve_model.fit(X_train_subset, y_train_subset)

    train_pred = curve_model.predict(X_train_subset)
    train_scores.append(accuracy_score(y_train_subset, train_pred))

    val_pred = curve_model.predict(X_val)
    val_scores.append(accuracy_score(y_val, val_pred))

# Plotting learning curves
fig, ax = plt.subplots(figsize=(10, 6))
ax.plot(param_range, train_scores, marker='o', label='Training Accuracy')
ax.plot(param_range, val_scores, marker='o', label='Validation Accuracy')
ax.set_xlabel('Training Set Size')
ax.set_ylabel('Accuracy')
ax.set_title('Learning Curves')
ax.legend()
ax.grid(True)
plt.show()