#### CLEANING DATA

In [None]:
import pandas as pd
import numpy as np
import cupy as cp
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder


# Fetch the NLTK stop-word corpus and keep the English entries as a set.
# NOTE(review): stop_words is not referenced again in the visible source -
# confirm whether stop-word removal was meant to be applied.
nltk.download("stopwords")
stop_words = {*stopwords.words("english")}

# Load the raw training data from the working directory.
df = pd.read_csv("newtrain.csv")


def clean_text(text):
    """Normalise a raw text value before TF-IDF vectorisation.

    The value is lower-cased, then URLs, digit runs and punctuation are
    removed, in that order. Whitespace is left untouched, so removed tokens
    can leave consecutive spaces behind.

    Args:
        text: The raw value; non-string values are converted with ``str``.
            ``None`` and float NaN are treated as missing text.

    Returns:
        The cleaned string, or ``""`` for a missing value.
    """
    # A missing value would otherwise be stringified into the literal token
    # "none"/"nan" and end up in the vocabulary. NaN is the only float that
    # is not equal to itself, so no extra import is needed to detect it.
    if text is None or (isinstance(text, float) and text != text):
        return ""

    text = str(text).lower()
    text = re.sub(r"http\S+|www\S+", "", text)  # URLs
    text = re.sub(r"\d+", "", text)  # digit runs
    # Punctuation; "_" counts as a word character and is therefore kept.
    text = re.sub(r"[^\w\s]", "", text)
    return text


# Normalise the raw text and turn the sentiment labels into integer codes.
df["cleaned_text"] = df["text"].apply(clean_text)
label_encoder = LabelEncoder()
df["sentiment"] = label_encoder.fit_transform(df["sentiment"])
y = df["sentiment"].to_numpy()

# Choose the 80/20 partition on row positions *before* fitting TF-IDF, so the
# vocabulary and IDF weights are learned from training rows only. Fitting on
# the full corpus lets held-out documents influence the features and makes
# the reported test accuracy optimistic.
row_positions = np.arange(len(df))
train_rows, test_rows = train_test_split(
    row_positions, train_size=0.80, test_size=0.20, random_state=0
)

vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=3000)
vectorizer.fit(df["cleaned_text"].iloc[train_rows])

# X is still materialised for the whole corpus (dense), but with features
# derived from the training rows only.
X = vectorizer.transform(df["cleaned_text"]).toarray()

X_train, X_test = X[train_rows], X[test_rows]
y_train, y_test = y[train_rows], y[test_rows]

# Move data to GPU
X_train = cp.array(X_train, dtype=cp.float64)
X_test = cp.array(X_test, dtype=cp.float64)
y_train = cp.array(y_train, dtype=cp.int64)
y_test = cp.array(y_test, dtype=cp.int64)

### Training using SVM

In [None]:
from cuml.svm import LinearSVC
from cuml.metrics import accuracy_score

# Fit a LinearSVC with default hyper-parameters on the training split, then
# predict labels for the held-out split.
svm_model = LinearSVC()
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)

# Report accuracy as a percentage rounded to two decimals.
accuracy_svm = round(100 * accuracy_score(y_test, y_pred_svm), 2)
print("SVM (GPU) Accuracy:", accuracy_svm)

### Converting the GPU-trained model to a CPU-based model
- so that it can run on low-end systems without a GPU

In [None]:
from sklearn.svm import LinearSVC

def convert_cuml_to_sklearn(cuml_model):
    """Build a scikit-learn ``LinearSVC`` carrying a fitted cuML model's weights.

    The returned estimator is never fitted itself: the learned attributes are
    copied from ``cuml_model`` onto a fresh scikit-learn instance.

    Args:
        cuml_model: A fitted cuML linear SVC exposing ``C``, ``coef_``,
            ``intercept_`` and ``classes_``. ``coef_`` and ``intercept_``
            must provide ``.get()`` to copy themselves to host memory.

    Returns:
        A scikit-learn ``LinearSVC`` with ``coef_``, ``intercept_`` and
        ``classes_`` set.
    """
    sklearn_model = LinearSVC(C=cuml_model.C)
    # NOTE(review): scikit-learn expects coef_ shaped
    # (1 or n_classes, n_features); presumably cuML uses the same layout -
    # confirm against the installed cuML version.
    sklearn_model.coef_ = cuml_model.coef_.get()
    sklearn_model.intercept_ = cuml_model.intercept_.get()

    # scikit-learn's predict() maps the decision-function result through
    # ``classes_``; without it the converted model raises AttributeError on
    # the first prediction.
    classes = cuml_model.classes_
    if hasattr(classes, "get"):
        classes = classes.get()  # device array -> host NumPy array
    sklearn_model.classes_ = np.asarray(classes)

    return sklearn_model

# Build the CPU-side model from the GPU-trained one.
cpu_model = convert_cuml_to_sklearn(svm_model)
# Nothing is written to disk at this point, so only the conversion is reported.
print("Converted to CPU-compatible model.")

### Saving the model

In [None]:
import pickle
# Persist the CPU model and the fitted TF-IDF vectorizer as two pickle files.
# NOTE(review): label_encoder is not persisted, so the integer codes cannot
# be mapped back to the original sentiment labels from these files alone -
# confirm whether that is intended.
filename = "svm_model.pkl"
with open(filename, "wb") as f:
    pickle.dump(cpu_model, f)

save_filename = "tfidf_vectorizer.pkl"
with open(save_filename, "wb") as f:
    pickle.dump(vectorizer, f)

# Report both destinations: the model and the vectorizer live in different
# files, so naming only one of them would be misleading.
print(f"Model saved to {filename}; vectorizer saved to {save_filename}")