In [19]:
import csv
import pandas as pd
from io import StringIO
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report
import nltk
import re

nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Define a function to preprocess text
def preprocess_text(text):
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#', '', text)  # Remove mentions and hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Keep only alphabets and spaces
    text = text.lower().split()  # Lowercase and split into words
    stops = set(stopwords.words("english"))  # English stopwords
    text = [WordNetLemmatizer().lemmatize(w) for w in text if not w in stops]  # Lemmatize and remove stopwords
    return " ".join(text)

# Manually handle CSV reading
def clean_csv(file_path):
    data = []
    with open(file_path, 'r', encoding='ISO-8859-1') as file:
        reader = csv.reader(file)
        for row in reader:
            try:
                # Only accept rows that have the expected number of columns
                if len(row) == 6:
                    data.append([row[0], row[5]])  # Assuming sentiment is in the first column and text in the sixth
            except csv.Error:
                continue
    return data

# Load and preprocess the data
data = clean_csv('/content/training.1600000.processed.noemoticon.csv')
df = pd.DataFrame(data, columns=['sentiment', 'text'])
df['sentiment'] = df['sentiment'].replace('4', '1')  # Normalize sentiment values
df['text'] = df['text'].apply(preprocess_text)  # Apply preprocessing

# Feature Extraction
vectorizer = TfidfVectorizer(max_features=1000)
X = vectorizer.fit_transform(df['text'])
y = df['sentiment'].astype(int)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models
gbm = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3)
ada = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1), n_estimators=100, learning_rate=0.1)
gbm.fit(X_train, y_train)
ada.fit(X_train, y_train)

# Evaluate models
gbm_preds = gbm.predict(X_test)
ada_preds = ada.predict(X_test)

print("Gradient Boosting Machine Performance:")
print(classification_report(y_test, gbm_preds))
print("AdaBoost Performance:")
print(classification_report(y_test, ada_preds))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Gradient Boosting Machine Performance:
              precision    recall  f1-score   support

           0       0.79      0.51      0.62    159494
           1       0.64      0.87      0.73    160506

    accuracy                           0.69    320000
   macro avg       0.71      0.69      0.68    320000
weighted avg       0.71      0.69      0.68    320000

AdaBoost Performance:
              precision    recall  f1-score   support

           0       0.76      0.42      0.54    159494
           1       0.60      0.87      0.71    160506

    accuracy                           0.65    320000
   macro avg       0.68      0.65      0.63    320000
weighted avg       0.68      0.65      0.63    320000



In [None]:
nRowsRead = 1000
df1 = pd.read_csv('training.1600000.processed.noemoticon.csv', delimiter=',', nrows = nRowsRead)
df1.dataframeName = 'training.1600000.processed.noemoticon.csv'
nRow, nCol = df1.shape
print(f'There are {nRow} rows and {nCol} columns')

There are 1000 rows and 6 columns


In [None]:
df1.head(5)


Unnamed: 0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, that's a bummer. You shoulda got David Carr of Third Day to do it. ;D"
0,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
1,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
2,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
3,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."
4,0,1467811372,Mon Apr 06 22:20:00 PDT 2009,NO_QUERY,joy_wolf,@Kwesidei not the whole crew
