# Load Processed Data

In [None]:
# Load the two preprocessed sentiment datasets from the Kaggle input directory.
# Later cells read the 'review_P' (text) and 'sentiment' (label) columns of both frames.
# NOTE(review): `pd` is presumably pandas, imported in an earlier cell — confirm.
imdb_dataset_processed = pd.read_csv('/kaggle/input/sentiments-processed/imdb_dataset_processed.csv')
print('imdb_dataset_processed done')
USAirline_dataset_processed =  pd.read_csv('/kaggle/input/sentiments-processed/USAirline_dataset_processed.csv')
print('USAirline_dataset_processed done')

In [None]:
# Inspect missing values, drop incomplete rows, then verify the clean-up.

# 1) Per-column NaN counts before cleaning.
print("imdb_dataset_processed:", imdb_dataset_processed.isna().sum())
print("USAirline_dataset_processed:", USAirline_dataset_processed.isna().sum())

# 2) Row counts before cleaning (IMDB first, then US Airline).
print("Before check null", imdb_dataset_processed.shape[0])
print("Before check null", USAirline_dataset_processed.shape[0])
print("")

# 3) Remove every row that contains a NaN in any column.
imdb_dataset_processed = imdb_dataset_processed.dropna()
USAirline_dataset_processed = USAirline_dataset_processed.dropna()

# 4) Row counts after cleaning (same order as above).
print("After drop null", imdb_dataset_processed.shape[0])
print("After drop null", USAirline_dataset_processed.shape[0])
print("")

# 5) NaN counts again; every column should now report 0.
print("AFTER imdb_dataset_processed:", imdb_dataset_processed.isna().sum())
print("AFTER USAirline_dataset_processed:", USAirline_dataset_processed.isna().sum())

# Data Relabeling: LabelEncoder

In [None]:
from sklearn.preprocessing import LabelEncoder
# Encode the string sentiment labels of the IMDB and US Airline datasets as integers.
#
# Each dataset gets its own encoder. Refitting a single shared encoder would
# overwrite the IMDB label <-> integer mapping (`classes_`) with the US Airline
# one, making `inverse_transform` on IMDB predictions impossible.
le_imdb = LabelEncoder()
le_usairline = LabelEncoder()

# The 'sentiment' column of each frame holds the class labels.
encoded_label_imdb = le_imdb.fit_transform(imdb_dataset_processed['sentiment'])
encoded_label_USAirline = le_usairline.fit_transform(USAirline_dataset_processed['sentiment'])

# Backward compatibility: `le` used to end up fitted on the US Airline labels.
le = le_usairline

# Feature Extraction

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Turn the preprocessed text column 'review_P' of each dataset into TF-IDF and
# raw term-count feature matrices.
#
# Every dataset/representation pair gets its own vectorizer. Refitting one
# shared instance would discard the IMDB vocabulary (and IDF weights), so new
# IMDB text could no longer be transformed into the same feature space.
#
# NOTE(review): the vectorizers are fitted on the complete datasets, while the
# train/test split happens in a later cell, so test-set vocabulary and IDF
# statistics leak into the features. Consider fitting on the training part only.
imdb_tfidf_vectorizer = TfidfVectorizer()
imdb_count_vectorizer = CountVectorizer()
imdb_dataset_tfidf = imdb_tfidf_vectorizer.fit_transform(imdb_dataset_processed['review_P'])
imdb_dataset_CountVectorizer = imdb_count_vectorizer.fit_transform(imdb_dataset_processed['review_P'])
print('\n------------->imdb_dataset: I AM DONE<-------------------')

usairline_tfidf_vectorizer = TfidfVectorizer()
usairline_count_vectorizer = CountVectorizer()
USAirline_dataset_tfidf = usairline_tfidf_vectorizer.fit_transform(USAirline_dataset_processed['review_P'])
USAirline_dataset_CountVectorizer = usairline_count_vectorizer.fit_transform(USAirline_dataset_processed['review_P'])
print('\n------------->USAirline_dataset: I AM DONE<-------------------')

# Backward compatibility: these names used to end up fitted on the US Airline text.
tfidf_vectorizer = usairline_tfidf_vectorizer
count_vectorizer = usairline_count_vectorizer

# Handling Imbalanced Datasets

In [None]:
from imblearn.over_sampling import SMOTE

# Oversample the US Airline data with SMOTE so that all classes are equally
# represented. Only this dataset is resampled; the original note states that
# the IMDB dataset is already balanced.
#
# A fixed seed keeps the synthetic samples, and therefore every metric computed
# downstream, reproducible between runs, like the seeded split and models.
#
# NOTE(review): despite the `X_train_` prefix, these variables hold the entire
# resampled dataset. The train/test split is applied to them in a later cell,
# so the test partition contains synthetic samples. Consider splitting first
# and resampling the training part only.
smote = SMOTE(random_state=42)
X_train_usairline_tfidf, y_train_usairline_tfidf = smote.fit_resample(USAirline_dataset_tfidf, encoded_label_USAirline)
X_train_usairline_CountVectorizer, y_train_usairline_CountVectorizer = smote.fit_resample(USAirline_dataset_CountVectorizer, encoded_label_USAirline)
print('\n------------->USAirline_dataset_fidf: I AM DONE<-------------------')

# Check the class distribution after SMOTE

In [None]:
from collections import Counter

# Count the samples per class in the resampled US Airline labels, once for the
# TF-IDF features and once for the CountVectorizer features, then print both.
class_distribution_tfidf = Counter(y_train_usairline_tfidf)
class_distribution_cv = Counter(y_train_usairline_CountVectorizer)

for heading, distribution in (
    ("Class distribution in USAirline_dataset_tfidf after SMOTE:", class_distribution_tfidf),
    ("\nClass distribution in USAirline_dataset_CountVectorizer after SMOTE:", class_distribution_cv),
):
    print(heading)
    for k, v in distribution.items():
        print(f"Class {k}: {v} instances")

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression, PassiveAggressiveClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# Registry of every (feature matrix, label vector) pair to evaluate.
# Keys follow the pattern "<dataset>_<transformation>"; a later cell splits the
# selected key on '_' to recover both parts, so each key must contain exactly
# one underscore.
# The IMDB entries use the vectorized data as-is; the US Airline entries use the
# SMOTE-resampled features and labels (the whole resampled set, despite the
# `X_train_`/`y_train_` prefixes).
datasets = {
    "imdb_tfidf": (imdb_dataset_tfidf, encoded_label_imdb),
    "imdb_count": (imdb_dataset_CountVectorizer, encoded_label_imdb),
    "usairline_tfidf": (X_train_usairline_tfidf, y_train_usairline_tfidf),
    "usairline_count": (X_train_usairline_CountVectorizer, y_train_usairline_CountVectorizer),
}

# Splitting function
def split_dataset(X, y, test_size=0.2, random_state=0):
    """Split features and labels into train and test partitions.

    Thin wrapper around ``train_test_split`` with this notebook's defaults.

    Args:
        X: Feature matrix.
        y: Labels aligned with the rows of ``X``.
        test_size: Forwarded to ``train_test_split`` (default 0.2).
        random_state: Seed forwarded to ``train_test_split`` (default 0).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    partitions = train_test_split(X, y, test_size=test_size, random_state=random_state)
    # train_test_split yields a list; callers receive a 4-tuple.
    return tuple(partitions)

# Split every registered dataset once and store its four partitions under the
# same key, as a dict with the entries "X_train", "X_test", "y_train", "y_test".
split_datasets = {}
for name, (X, y) in datasets.items():
    X_train, X_test, y_train, y_test = split_dataset(X, y)
    split_datasets[name] = dict(
        zip(
            ("X_train", "X_test", "y_train", "y_test"),
            (X_train, X_test, y_train, y_test),
        )
    )

**Dynamic dataset selection**

In [None]:
# Key of the dataset/feature combination evaluated by the cells below.
# It must be one of the keys of `datasets`:
# "imdb_tfidf", "imdb_count", "usairline_tfidf" or "usairline_count".
selected_dataset = "imdb_tfidf" # change this key to evaluate another dataset

# Apply FE + ML Models on Each Dataset

In [None]:
# Pull the four partitions of the selected dataset out of `split_datasets`.
X_train, X_test, y_train, y_test = (
    split_datasets[selected_dataset][part]
    for part in ("X_train", "X_test", "y_train", "y_test")
)

# The key has the form "<dataset>_<transformation>"; the upper-cased parts are
# used in the printed/written report, the raw parts in the output filename.
dataset_name, transformation = selected_dataset.split('_')
formatted_dataset_name, formatted_transformation = (
    dataset_name.upper(),
    transformation.upper(),
)

# Classifiers to compare, keyed by the name shown in the report.
# LinearSVC is wrapped in CalibratedClassifierCV (10-fold).
models = {
    'Logistic Regression': LogisticRegression(random_state=0),
    'SVM': CalibratedClassifierCV(LinearSVC(random_state=0), cv=10),
    'Passive_Aggressive': PassiveAggressiveClassifier(random_state=0),
    'RandomForest': RandomForestClassifier(random_state=0),
    'AdaBoost': AdaBoostClassifier(random_state=0),
    'MultinomialNB': MultinomialNB(),
    'XGBoost': XGBClassifier(
        use_label_encoder=False,
        eval_metric='logloss',
        random_state=0,
    ),
}

# Filled by the evaluation loop: model name -> (accuracy, classification report).
results = {}

# Training, prediction, and evaluation function
def train_predict_evaluate(model, X_train, X_test, y_train, y_test):
    """Fit *model* on the training split and score it on the test split.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``.
        X_train: Training features.
        X_test: Test features.
        y_train: Training labels.
        y_test: Test labels.

    Returns:
        A tuple ``(accuracy, class_report)``. On success ``accuracy`` is the
        value of ``accuracy_score`` and ``class_report`` the text produced by
        ``classification_report`` (4 digits). If anything fails, ``accuracy``
        is ``float("nan")`` and ``class_report`` describes the error.
    """
    try:
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        accuracy = accuracy_score(y_test, predictions)
        class_report = classification_report(y_test, predictions, digits=4)
        return accuracy, class_report
    except Exception as e:
        # Deliberately best-effort: one failing model must not abort the whole
        # comparison. The accuracy stays numeric (NaN) so that callers can keep
        # formatting it with float format specs such as ``:.4f``; returning the
        # error text in its place would make that formatting raise ValueError.
        return float("nan"), f"Model failed: {type(e).__name__}: {e}"

# Train and evaluate every model on the selected dataset, print the results as
# they become available, then save the same report to a text file.


def _format_accuracy(accuracy):
    """Render an accuracy for the report; error text is passed through as-is."""
    # A failed model may report its accuracy as an error string; applying the
    # ``.4f`` format spec to a str raises ValueError and would abort the run.
    if isinstance(accuracy, str):
        return accuracy
    return f"{accuracy:.4f}"


report_header = f"Evaluating models on {formatted_dataset_name} dataset with {formatted_transformation} transformation"

print(report_header)
for model_name, model in models.items():
    accuracy, class_report = train_predict_evaluate(model, X_train, X_test, y_train, y_test)
    results[model_name] = (accuracy, class_report)
    print(f"\nModel: {model_name} ({formatted_transformation})")
    print(f"Accuracy: {_format_accuracy(accuracy)}")
    print("Classification Report:")
    print(class_report)

# The output filename is derived from the selected dataset and transformation.
output_filename = f"/kaggle/working/model_for_{dataset_name}_{transformation}.txt"

# Explicit encoding so that error messages with non-ASCII text are written
# the same way on every platform.
with open(output_filename, "w", encoding="utf-8") as file:
    file.write(f"{report_header}\n")
    for model_name, (accuracy, class_report) in results.items():
        file.write(f"\nModel: {model_name} ({formatted_transformation})\n")
        file.write(f"Accuracy: {_format_accuracy(accuracy)}\n")
        file.write("Classification Report:\n")
        file.write(f"{class_report}\n")