In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re

In [None]:
# Function to preprocess data from text file
def preprocess_data_from_file(filepath):
    """Load a labelled text file into a DataFrame.

    Every non-blank line must have the form ``__label__<int> <text>``: the
    label token and the text are separated by the first space on the line.
    Leading/trailing whitespace around each line is stripped.

    Args:
        filepath: Path of the file to read; it is decoded as UTF-8.

    Returns:
        pandas.DataFrame with a ``label`` column (ints) and a ``text`` column
        (strings), one row per non-blank line. An empty or all-blank file
        yields an empty frame that still has both columns.

    Raises:
        ValueError: If a line's first token is not ``__label__<int>``; the
            message names the file and the line number.
    """
    labels = []
    texts = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = file.read()
    for line_number, raw_line in enumerate(data.split('\n'), start=1):
        line = raw_line.strip()
        if not line:
            # Blank lines (including the one after a trailing newline) carry no sample.
            continue
        # partition never fails: a label-only line simply gets an empty text.
        label_token, _, text = line.partition(' ')
        try:
            label = int(label_token.split('__label__')[1])
        except (IndexError, ValueError) as exc:
            raise ValueError(
                f"{filepath}: line {line_number}: expected '__label__<int> <text>', got {line!r}"
            ) from exc
        labels.append(label)
        texts.append(text)
    return pd.DataFrame({'label': labels, 'text': texts})

In [None]:
# Load the labelled training data into a DataFrame with 'label' and 'text' columns.
# The path is relative, so it is resolved against the kernel's working directory.
train_filepath = 'train.3270.txt'
df_train = preprocess_data_from_file(train_filepath)

In [None]:
# Preview the first 10 rows of the freshly loaded training frame
df_train.head(10)

In [None]:
# Preprocess text
def preprocess_text(text):
    """Normalize a raw string for vectorization.

    The text is lower-cased, every non-word character is replaced by a
    space, and each run of whitespace is then collapsed to one space.
    Leading/trailing spaces that result from this are not trimmed.
    """
    lowered = text.lower()
    spaced = re.sub(r'\W', ' ', lowered)
    return re.sub(r'\s+', ' ', spaced)

In [None]:
# Normalize every training text with preprocess_text; the 'text' column is overwritten in place
df_train['text'] = df_train['text'].apply(preprocess_text)

In [None]:
# Preview the first 10 rows again to inspect the effect of preprocess_text on the 'text' column
df_train.head(10)

In [None]:
# Hold out 20% of the labelled data for validation; the fixed seed keeps the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    df_train['text'],
    df_train['label'],
    test_size=0.2,
    random_state=42,
)

In [None]:
# Feature engineering: turn the text into TF-IDF vectors.
# The vectorizer is fitted on the training split only; the validation split is
# transformed with that same fitted vectorizer, so validation text never shapes it.
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

In [None]:
# Show the first text converted to numerical form.
# The already-fitted vectorizer is reused (transform only); the single text is
# wrapped in a one-element list before being passed to transform.
first_text = X_train.iloc[0]
first_text_tfidf = vectorizer.transform([first_text])
print(f"First text: {first_text}")
print(f"First text TF-IDF features: {first_text_tfidf}")

In [None]:
# Get the feature names (terms) from the TF-IDF vectorizer
feature_names = vectorizer.get_feature_names_out()

# Look up the term stored at one example vocabulary index.
# The vocabulary size depends on the training data, so the index is checked
# first instead of letting a smaller vocabulary raise IndexError.
term_index = 14256
if term_index < len(feature_names):
    term = feature_names[term_index]
    print(f"Term corresponding to index {term_index}: {term}")
else:
    print(f"Index {term_index} is out of range: the vocabulary has {len(feature_names)} terms")

In [None]:
# Display the terms with their corresponding TF-IDF scores for the first text.
# Walks the `.indices` and `.data` arrays of the transformed text in parallel and
# maps each index back to its term through feature_names.
# NOTE(review): assumes first_text_tfidf is a SciPy sparse (CSR) row — confirm.
print("Terms and TF-IDF scores for the first text:")
for index, score in zip(first_text_tfidf.indices, first_text_tfidf.data):
    term = feature_names[index]
    print(f"({index}, {term})\t{score}")

In [None]:
# Model selection and training: a LogisticRegression created with no arguments
# (library defaults), fitted on the TF-IDF features of the training split.
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

In [None]:
# Model evaluation on validation set: predict labels for the held-out split,
# then print the accuracy and the classification report against y_val.
y_val_pred = model.predict(X_val_tfidf)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))

In [None]:
# Load test data from file
test_filepath = 'test.135.txt'
df_test = preprocess_data_from_file(test_filepath)

# Apply the same text preprocessing that was applied to the training texts
df_test['text'] = df_test['text'].apply(preprocess_text)

# Feature engineering: reuse the vectorizer fitted on the training split (transform only)
X_test_tfidf = vectorizer.transform(df_test['text'])

# Model evaluation on test set.
# Both metrics are computed from the same y_test_actual array so they cannot drift apart.
y_test_pred = model.predict(X_test_tfidf)
y_test_actual = df_test['label'].to_numpy()
print("Test Accuracy:", accuracy_score(y_test_actual, y_test_pred))
print("Test Classification Report:\n", classification_report(y_test_actual, y_test_pred))

In [None]:
# Function to predict sentiment for a custom input text
def predict_custom_text(text):
    """Predict the label of one raw text.

    The text goes through preprocess_text, is transformed by the module-level
    `vectorizer`, and is classified by the module-level `model`.

    Returns:
        The first (and only) element of the model's prediction array.
    """
    features = vectorizer.transform([preprocess_text(text)])
    return model.predict(features)[0]

# Example custom input text: run one hand-written review through
# predict_custom_text and print the label it returns.
custom_text = "I absolutely love this product! It's fantastic and works great."
predicted_label = predict_custom_text(custom_text)
print(f"Custom Text Prediction: {predicted_label}")

In [None]:
from flask import Flask, request, jsonify

# Deployment example (Flask app)
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Classify the text sent in a JSON body of the form ``{"text": "..."}``.

    Returns:
        ``{"prediction": <int>}`` on success, or a 400 response with an
        ``error`` message when the body is not a JSON object holding a string
        under ``text``.
    """
    # silent=True yields None for a missing or malformed JSON body instead of raising.
    payload = request.get_json(silent=True)
    if not isinstance(payload, dict) or not isinstance(payload.get('text'), str):
        return jsonify({'error': "Request body must be a JSON object with a string 'text' field."}), 400

    text = preprocess_text(payload['text'])
    X = vectorizer.transform([text])
    prediction = model.predict(X)
    # int(...) converts the predicted label to a plain Python int before it is serialized.
    return jsonify({'prediction': int(prediction[0])})

if __name__ == '__main__':
    # debug=True would expose the interactive Werkzeug debugger (remote code execution)
    # and start the auto-reloader, which does not work from inside a notebook kernel.
    # NOTE: app.run blocks this cell until the server is stopped.
    app.run(debug=False)

### Testing zone

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data this cell needs.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on NLTK 3.9 and
# later; 'punkt' is kept so the cell still works on older releases.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

text = "Natural Language Processing is an exciting field!"
# Lowercase and tokenize, then keep only purely alphabetic tokens (drops "!")
tokens = word_tokenize(text.lower())
tokens = [t for t in tokens if t.isalpha()]
# Remove English stop words
stop_words = set(stopwords.words('english'))
tokens = [t for t in tokens if t not in stop_words]
# Lemmatize what is left
lemmatizer = WordNetLemmatizer()
tokens = [lemmatizer.lemmatize(t) for t in tokens]

print(tokens)  # Output: ['natural', 'language', 'processing', 'exciting', 'field']


#### Step-by-Step Data Preprocessing

In [None]:
# 1. Import libraries
import pandas as pd
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

# Download necessary NLTK data.
# 'punkt_tab' is the tokenizer resource that word_tokenize loads on NLTK 3.9 and
# later; 'punkt' is kept so older releases keep working.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')


In [None]:
# 2. Load Data
def preprocess_data_from_file(filepath):
    """Load a labelled text file into a DataFrame.

    Every non-blank line must have the form ``__label__<int> <text>``: the
    label token and the text are separated by the first space on the line.
    Leading/trailing whitespace around each line is stripped.

    Args:
        filepath: Path of the file to read; it is decoded as UTF-8.

    Returns:
        pandas.DataFrame with a ``label`` column (ints) and a ``text`` column
        (strings), one row per non-blank line. An empty or all-blank file
        yields an empty frame that still has both columns.

    Raises:
        ValueError: If a line's first token is not ``__label__<int>``; the
            message names the file and the line number.
    """
    labels = []
    texts = []
    # Explicit encoding: the platform default is not UTF-8 everywhere.
    with open(filepath, 'r', encoding='utf-8') as file:
        data = file.read()
    for line_number, raw_line in enumerate(data.split('\n'), start=1):
        line = raw_line.strip()
        if not line:
            # Blank lines (including the one after a trailing newline) carry no sample.
            continue
        # partition never fails: a label-only line simply gets an empty text.
        label_token, _, text = line.partition(' ')
        try:
            label = int(label_token.split('__label__')[1])
        except (IndexError, ValueError) as exc:
            raise ValueError(
                f"{filepath}: line {line_number}: expected '__label__<int> <text>', got {line!r}"
            ) from exc
        labels.append(label)
        texts.append(text)
    return pd.DataFrame({'label': labels, 'text': texts})

# Reload the training file from scratch; this rebinds df_train, replacing any
# frame that earlier cells built and modified.
train_filepath = 'train.3270.txt'
df_train = preprocess_data_from_file(train_filepath)

In [None]:
# 3. Preprocessing Functions

# Initialize stop words, stemmer, and lemmatizer once at module level so that
# preprocess_text reuses them on every call instead of rebuilding them.
# stop_words is a set, which keeps the per-token membership test cheap.
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, method='lemmatize'):
    """Clean, tokenize and normalize one text, returning it as a single string.

    Steps: lowercase, replace non-word characters with spaces, collapse
    whitespace, tokenize with word_tokenize, drop tokens found in the
    module-level `stop_words`, then normalize each remaining token.

    Args:
        text: Raw input string.
        method: 'stem' applies the module-level `stemmer`; 'lemmatize'
            (default) applies the module-level `lemmatizer`; any other value
            leaves the tokens as they are.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lowercase, blank out punctuation/special characters, squeeze whitespace
    cleaned = re.sub(r'\s+', ' ', re.sub(r'\W', ' ', text.lower()))

    # Tokenize and filter stop words in one pass
    kept = [tok for tok in word_tokenize(cleaned) if tok not in stop_words]

    # Stemming or lemmatization, depending on the requested method
    if method == 'stem':
        kept = list(map(stemmer.stem, kept))
    elif method == 'lemmatize':
        kept = list(map(lemmatizer.lemmatize, kept))

    return ' '.join(kept)

# Apply text preprocessing (lemmatization variant) to every row.
# The 'text' column is overwritten in place, so the raw text is only
# recoverable by reloading the file.
df_train['text'] = df_train['text'].apply(lambda x: preprocess_text(x, method='lemmatize'))


In [None]:
# 4. Handling Imbalanced Data

# Before handling imbalanced data, let's check the distribution of labels:
# value_counts() reports how many rows carry each label value.
print(df_train['label'].value_counts())

In [None]:
# If the data is imbalanced, we can use SMOTE to oversample the minority class.

# Split data into features and labels
X = df_train['text']
y = df_train['label']

# Split into training and validation sets (80/20, fixed seed for a repeatable split).
# These names rebind the X_train/X_val/y_train/y_val created by earlier cells.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF Vectorization: fit on the training split only, then reuse for validation
vectorizer = TfidfVectorizer(stop_words='english')
X_train_tfidf = vectorizer.fit_transform(X_train)
X_val_tfidf = vectorizer.transform(X_val)

# Handle imbalanced data using SMOTE.
# Only the training features are resampled; X_val_tfidf is left untouched.
# NOTE(review): SMOTE presumably needs several samples of each class to
# interpolate from — confirm every class in y_train is large enough.
smote = SMOTE(random_state=42)
X_train_tfidf_resampled, y_train_resampled = smote.fit_resample(X_train_tfidf, y_train)

In [None]:
# 5. Model Training and Evaluation

# Model selection and training: fit on the SMOTE-resampled training features.
# This rebinds `model`, replacing the classifier trained by earlier cells.
model = LogisticRegression()
model.fit(X_train_tfidf_resampled, y_train_resampled)

# Model evaluation on validation set (the validation features were not resampled)
y_val_pred = model.predict(X_val_tfidf)
print("Validation Accuracy:", accuracy_score(y_val, y_val_pred))
print("Validation Classification Report:\n", classification_report(y_val, y_val_pred))
