In [4]:
!pip install pandas numpy scikit-learn nltk joblib



In [1]:
!pip install joblib

import joblib
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.metrics import classification_report

from download import preprocess_text

# Load the validation tweets. The columns are addressed purely by position
# below (2 = sentiment, 3 = text), exactly like the training CSV, which is read
# with header=None. Without header=None pandas would consume the first tweet as
# column names and silently drop it from the evaluation set.
test_data = pd.read_csv("twitter_validation.csv", header=None)
print(test_data.shape)
print(test_data.columns.tolist())

# Give the positional columns stable names, then label the two we use.
test_data.columns = [f"col_{i}" for i in range(test_data.shape[1])]
test_data = test_data.rename(columns={
    f"col_{3}": "text",
    f"col_{2}": "sentiment"
})

# Apply the same text cleaning that was used to build the training features.
test_data["cleaned_text"] = test_data["text"].apply(preprocess_text)
print(test_data.columns.tolist())


# NOTE: joblib.load unpickles arbitrary Python objects; only load artifacts
# that were produced by a trusted training run.
loaded_model = joblib.load("sentiment_model.pkl")
loaded_tfidf = joblib.load("tfidf_vectorizer.pkl")

# transform (not fit_transform): reuse the vocabulary learned at training time.
X_test = loaded_tfidf.transform(test_data["cleaned_text"])

test_pred = loaded_model.predict(X_test)
test_prob = loaded_model.predict_proba(X_test)

# Confidence of a prediction = highest class probability for that row.
test_data["predicted_sentiment"] = test_pred
test_data["prediction_confidence"] = np.max(test_prob, axis=1)

def model_evaluation(y_true, y_pred, model_name="Sentiment Model"):
    """Print and return headline classification metrics.

    Accuracy plus support-weighted precision and recall are computed with
    scikit-learn, printed together with the full per-class report, and
    returned to the caller.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predicted labels, aligned with ``y_true``.
        model_name: Label printed as a header above the metrics.

    Returns:
        dict with the keys ``"accuracy"``, ``"precision"`` and ``"recall"``.
    """
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average="weighted", zero_division=0)
    recall = recall_score(y_true, y_pred, average="weighted", zero_division=0)

    print(f"Model: {model_name}")
    print(f"Accuracy: {accuracy:.4f} ({accuracy*100:.2f}%)")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    # No hard-coded target_names: the report derives its row labels from the
    # classes actually present in y_true / y_pred. A fixed four-name list makes
    # classification_report raise when the data holds a different number of
    # classes and can silently mislabel rows if the class order differs.
    print(classification_report(y_true, y_pred, zero_division=0))

    return {
        "accuracy": accuracy,
        "precision": precision,
        "recall": recall,
    }

def analyze_predictions(df, text_col="text", pred_col='predicted_sentiment', true_col=None, confidence_col='prediction_confidence'):
    """Print a qualitative summary of model predictions.

    Prints the 10 rows with the highest confidence, then the 10 rows with the
    lowest confidence, and finally how many rows (and what percentage of
    ``df``) were assigned to each predicted class.

    Args:
        df: DataFrame holding one prediction per row.
        text_col: Column with the text that was classified.
        pred_col: Column with the predicted label.
        true_col: Optional column with the true label; when falsy, "N/A" is
            printed in its place.
        confidence_col: Numeric column with the prediction confidence.

    Returns:
        None. All results are written to stdout.
    """
    def _print_examples(rows):
        # One formatted line per row; shared by the top and bottom listings.
        for _, row in rows.iterrows():
            true_label = row[true_col] if true_col else "N/A"
            # str() keeps NaN / non-string text from raising on the slice.
            snippet = str(row[text_col])[:80]
            print(f"{snippet}... -> Predict: {row[pred_col]}, "
                  f"True: {true_label}, Confidence: {row[confidence_col]:.4f}")

    print("Analyze: \n")
    _print_examples(df.nlargest(10, confidence_col))
    _print_examples(df.nsmallest(10, confidence_col))

    total_rows = len(df)
    pred_distribution = df[pred_col].value_counts().sort_index()
    for class_label, count in pred_distribution.items():
        percentage = (count / total_rows) * 100
        print(f"Class {class_label}: {count} examples ({percentage:.1f}%)")

# Score against ground truth when the validation frame carries labels;
# otherwise fall back to summarising the confidence distribution only.
has_ground_truth = "sentiment" in test_data.columns
if not has_ground_truth:
    print(test_data["prediction_confidence"].describe())
    analyze_predictions(test_data)
else:
    true_labels = test_data["sentiment"]
    metrics = model_evaluation(true_labels, test_pred)
    analyze_predictions(test_data, true_col="sentiment")

print("\n")
# Show the first 20 tweets next to the label predicted for each.
preview = test_data.head(20)
for tweet, predicted in zip(preview["text"], preview["predicted_sentiment"]):
    print(f"{tweet} -> {predicted}")

Train_data shape:  (74682, 4)
      0            1         2  \
0  2401  Borderlands  Positive   
1  2401  Borderlands  Positive   
2  2401  Borderlands  Positive   
3  2401  Borderlands  Positive   
4  2401  Borderlands  Positive   

                                                   3  
0  im getting on borderlands and i will murder yo...  
1  I am coming to the borders and I will kill you...  
2  im getting on borderlands and i will kill you ...  
3  im coming on borderlands and i will murder you...  
4  im getting on borderlands 2 and i will murder ...  
row 0: [np.int64(2401), 'Borderlands', 'Positive', 'im getting on borderlands and i will murder you all ,']
row 1: [np.int64(2401), 'Borderlands', 'Positive', 'I am coming to the borders and I will kill you all,']
row 2: [np.int64(2401), 'Borderlands', 'Positive', 'im getting on borderlands and i will kill you all,']
sentiment
Negative      22542
Positive      20832
Neutral       18318
Irrelevant    12990
Name: count, dtype: int64


In [None]:
import pandas as pd
import numpy as np
import joblib

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report

# Positions of the columns we care about in the header-less training CSV.
text_column = 3
sentiment_column = 2

train_data = pd.read_csv("twitter_training.csv", header=None)
print("Train_data shape: ", train_data.shape)
print(train_data.head())

# Dump the first three raw rows to confirm which position holds what.
for i in range(3):
    raw_row = train_data.iloc[i].tolist()
    print(f"row {i}: {raw_row}")

# Name every column by position, then label the text and sentiment columns.
n_columns = train_data.shape[1]
train_data.columns = [f"col_{position}" for position in range(n_columns)]
readable_names = {
    f"col_{text_column}": "text",
    f"col_{sentiment_column}": "sentiment",
}
train_data = train_data.rename(columns=readable_names)

# Class balance of the training labels.
print(train_data["sentiment"].value_counts())

import nltk
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('omw-1.4')

# Lazily-built NLTK resources shared by every preprocess_text call.
_TEXT_RESOURCES = {}


def _text_resources():
    """Return the cached (stop-word set, lemmatizer) pair, building it on first use."""
    if not _TEXT_RESOURCES:
        stop_words = frozenset(stopwords.words('english'))
        lemmatizer = WordNetLemmatizer()
        # Store both together so a failure above never leaves a half-filled cache.
        _TEXT_RESOURCES.update(stop_words=stop_words, lemmatizer=lemmatizer)
    return _TEXT_RESOURCES["stop_words"], _TEXT_RESOURCES["lemmatizer"]


def preprocess_text(text):
    """Normalise one piece of text into a space-joined string of lemmas.

    Steps: coerce non-strings to ``str`` (missing values become ``""``), drop
    every character that is not an ASCII letter or whitespace, lower-case,
    remove English stop words and lemmatize the remaining tokens.

    Args:
        text: The raw text; may be a non-string or a missing value.

    Returns:
        The cleaned text as a single string (possibly empty).
    """
    if not isinstance(text, str):
        if pd.isna(text):
            return ""
        text = str(text)

    text = re.sub(r'[^a-zA-Z\s]', '', text, flags=re.I|re.A)
    text = text.lower().strip()

    # The stop-word list and lemmatizer are identical for every call, so they
    # are built once and reused instead of being reloaded per row.
    stop_words, lemmatizer = _text_resources()
    tokens = [lemmatizer.lemmatize(t) for t in text.split() if t not in stop_words]

    return ' '.join(tokens)

# Clean every tweet before vectorising.
train_data["cleaned_data"] = train_data["text"].apply(preprocess_text)
print(train_data.columns.tolist())

# Unigram + bigram TF-IDF features, capped at the 5000 most frequent terms.
tfidf = TfidfVectorizer(
    max_features=5000,
    stop_words='english',
    ngram_range=(1, 2)
)

X_train = tfidf.fit_transform(train_data["cleaned_data"])
y_train = train_data["sentiment"]

print(f"TF-IDF matrix shape: {X_train.shape}")

# class_weight="balanced" re-weights classes inversely to their frequency.
model = LogisticRegression(
    random_state=42,
    max_iter=1000,
    class_weight="balanced"
)
model.fit(X_train, y_train)

# Training-set metrics only: a sanity check, not an estimate of generalisation.
train_predictions = model.predict(X_train)
train_accuracy = accuracy_score(y_train, train_predictions)
print(f"Train Accuracy: {train_accuracy:.4f}")
# Call classification_report; the previous code printed the function object
# and the raw prediction array instead of the report.
print(classification_report(y_train, train_predictions, zero_division=0))

# Persist the fitted model and vectorizer for the evaluation cell.
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
print("save")