In [3]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Fetch the NLTK corpora used by the preprocessing step below
for _corpus in ('stopwords', 'wordnet'):
    nltk.download(_corpus)

# Shared preprocessing resources: a WordNet lemmatizer and the English stop-word set
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Load training data from file and preprocess text
def preprocess_data_and_text_from_file(filepath, encoding='utf-8'):
    """Load a labelled text file and return its labels with cleaned text.

    Every non-blank line is expected to start with a ``__label__<int>`` token,
    followed by whitespace and the raw text. The text is lower-cased, runs of
    non-word characters and underscores are collapsed to single spaces, stop
    words are dropped, and the remaining tokens are lemmatized and re-joined
    with spaces.

    Args:
        filepath: Path of the file to read.
        encoding: Text encoding used to decode the file. An explicit encoding
            keeps the result independent of the platform's locale default.

    Returns:
        pandas.DataFrame with an integer ``label`` column and a ``text`` column
        holding the cleaned text, one row per non-blank input line. An empty
        file yields an empty frame with the same two columns.

    Raises:
        IndexError: If a line's first token does not contain ``__label__``.
        ValueError: If the part after ``__label__`` is not an integer.
        UnicodeDecodeError: If the file cannot be decoded with ``encoding``.
    """
    labels = []
    texts = []
    with open(filepath, 'r', encoding=encoding) as file:
        # Stream the file line by line instead of loading it whole.
        for raw_line in file:
            line = raw_line.strip()
            if not line:
                # Blank lines carry no sample; skipping them also makes an
                # empty file produce an empty frame rather than an error.
                continue

            # Split on any whitespace so a tab separator or a label-only line
            # does not break the unpacking.
            parts = line.split(None, 1)
            label = int(parts[0].split('__label__')[1])
            text = parts[1] if len(parts) > 1 else ''

            text = re.sub(r'[\W_]+', ' ', text.lower())
            tokens = [
                lemmatizer.lemmatize(word)
                for word in text.split()
                if word not in stop_words
            ]

            labels.append(label)
            texts.append(' '.join(tokens))
    return pd.DataFrame({'label': labels, 'text': texts})

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\admin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Read the labelled training file and clean its text
train_filepath = 'train.3270.txt'
df_train = preprocess_data_and_text_from_file(train_filepath)

# Features are the cleaned text column; targets are the integer labels
X, y = df_train['text'], df_train['label']

In [5]:
# Hold out 20% of the rows for validation; the fixed seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Define a method for TF-IDF vectorization
def apply_tfidf(X_train, X_val, max_features=None):
    """Fit a TF-IDF vectorizer on the training text and transform both splits.

    The vectorizer is fitted on ``X_train`` only; ``X_val`` is transformed with
    the already-fitted vectorizer.

    Args:
        X_train: Training documents (iterable of strings).
        X_val: Validation documents (iterable of strings).
        max_features: Forwarded to ``TfidfVectorizer``; ``None`` leaves the
            vocabulary size uncapped.

    Returns:
        Tuple ``(train_matrix, val_matrix, fitted_vectorizer)``.
    """
    vectorizer = TfidfVectorizer(stop_words='english', max_features=max_features)
    # Tuple elements are evaluated left to right, so fitting happens before
    # the validation split is transformed.
    return vectorizer.fit_transform(X_train), vectorizer.transform(X_val), vectorizer

# Vectorize both splits, capping the vocabulary at 10,000 terms
X_train_tfidf, X_val_tfidf, tfidf_vectorizer = apply_tfidf(
    X_train,
    X_val,
    max_features=10000,
)

In [6]:
# Optional sanity check: show the vocabulary learned by the vectorizer
print("\nFeature names (TF-IDF):", tfidf_vectorizer.get_feature_names_out(), sep="\n")


Feature names (TF-IDF):
['00' '000' '00290' ... 'zune' 'zydeco' 'zzzzzzzzzz']


In [7]:
# Optional sanity check: show the non-zero TF-IDF entries of the first training row
print("\nFirst row of TF-IDF (training data):", X_train_tfidf[0], sep="\n")


First row of TF-IDF (training data):
  (0, 4945)	0.09079259013672877
  (0, 4130)	0.15530066651714913
  (0, 9298)	0.08617394217254595
  (0, 7186)	0.12932075854994385
  (0, 5132)	0.13363228620606266
  (0, 2624)	0.1006139876021614
  (0, 2982)	0.15304677792952365
  (0, 3260)	0.10996941220802339
  (0, 5274)	0.0894409118909724
  (0, 9300)	0.09272410660580611
  (0, 1951)	0.10996941220802339
  (0, 8718)	0.11542081985817772
  (0, 2382)	0.13270479747446862
  (0, 8466)	0.10307954481532518
  (0, 329)	0.10829155042758754
  (0, 2211)	0.13363228620606266
  (0, 7130)	0.13363228620606266
  (0, 2086)	0.11414224192205502
  (0, 5695)	0.10334085058273854
  (0, 5449)	0.08929538570041912
  (0, 1243)	0.10048230930858489
  (0, 1270)	0.08188361391841693
  (0, 4807)	0.11921259183737265
  (0, 9572)	0.10733915602614436
  (0, 4740)	0.14571225548150182
  (0, 4506)	0.1769690468282356
  (0, 5276)	0.14418605296511966
  (0, 8089)	0.12779455603356168
  (0, 7230)	0.1716921634487071
  (0, 9102)	0.13560408876893063
  (0, 5

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Fit a Logistic Regression classifier on the TF-IDF training features
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Score the held-out validation split
y_val_pred = model.predict(X_val_tfidf)
print("\nValidation Accuracy: ", accuracy_score(y_val, y_val_pred))
print("\nClassification Report: ", classification_report(y_val, y_val_pred), sep="\n")

# Function to predict sentiment for new texts
def predict_sentiment(texts, model, vectorizer):
    """Predict labels for raw, unprocessed texts.

    Each text is cleaned with the same steps the training loader
    (``preprocess_data_and_text_from_file``) applies: lower-casing, collapsing
    non-word characters and underscores to spaces, dropping stop words, and
    lemmatizing the remaining tokens. The cleaned texts are then vectorized
    and passed to the model.

    Args:
        texts: Iterable of raw strings. A single string is treated as one
            document rather than being iterated character by character.
        model: Fitted estimator exposing ``predict``.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        Whatever ``model.predict`` returns for the vectorized texts.
    """
    if isinstance(texts, str):
        texts = [texts]

    cleaned_texts = []
    for text in texts:
        # Cleaning is done inline because no ``preprocess_text`` helper exists
        # in this notebook; calling it raised NameError.
        normalized = re.sub(r'[\W_]+', ' ', text.lower())
        tokens = [
            lemmatizer.lemmatize(word)
            for word in normalized.split()
            if word not in stop_words
        ]
        cleaned_texts.append(' '.join(tokens))

    texts_tfidf = vectorizer.transform(cleaned_texts)
    return model.predict(texts_tfidf)

# Try the classifier on two unseen sentences
new_texts = [
    "This is the best book I have ever read!",
    "The movie was too long and boring.",
]
predictions = predict_sentiment(new_texts, model, tfidf_vectorizer)
print("\nPredictions: ", predictions)