Pengumpulan Data

In [398]:
# Pengumpulan Data
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob

# Download necessary NLTK resources
nltk.download('stopwords')
nltk.download('wordnet')

# Load the data
file_path = 'tweets.csv'
data = pd.read_csv(file_path)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\estar\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\estar\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Pre-Proccessing

In [399]:
# Pre-Processing
# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('indonesian'))
lemmatizer = WordNetLemmatizer()

# Clean tweet text
def clean_tweet(tweet):
    tweet = re.sub(r'http\S+', '', tweet)  # Remove URLs
    tweet = re.sub(r'\W', ' ', tweet)  # Remove punctuation
    tweet = tweet.lower()  # Convert to lowercase
    tweet = ' '.join([word for word in tweet.split() if word not in stop_words])  # Remove stopwords
    tweet = ' '.join([lemmatizer.lemmatize(word) for word in tweet.split()])  # Lemmatize words
    return tweet

# Apply cleaning to tweets
data['cleaned_tweet'] = data['full_text'].apply(clean_tweet)

Pembobotan

In [400]:
# Function to load words from a file
def load_words(file_path):
    with open(file_path, 'r') as file:
        words = file.read().splitlines()
    return words

# Load positive and negative words
positive_words_file = 'positive.txt'
negative_words_file = 'negative.txt'

# Label Sentiment
# Function to label sentiment based on keywords
def label_sentiment(text):
    positive_keywords = load_words(positive_words_file)
    negative_keywords = load_words(negative_words_file)
    
    text = text.lower()
    if any(word in text for word in positive_keywords):
        return 'positive'
    elif any(word in text for word in negative_keywords):
        return 'negative'
    else:
        return 'neutral'

# Apply the function to create a sentiment column
data['sentiment'] = data['cleaned_tweet'].apply(label_sentiment)

Analisis Sentimen

In [401]:
# Sentiment Analysis using TextBlob
def analyze_sentiment(text):
    analysis = TextBlob(text)
    # Classify the polarity as positive, negative, or neutral
    if analysis.sentiment.polarity > 0:
        return 'positive'
    elif analysis.sentiment.polarity < 0:
        return 'negative'
    else:
        return 'neutral'
    
# Apply sentiment analysis to cleaned tweets
data['textblob_sentiment'] = data['cleaned_tweet'].apply(analyze_sentiment)

Extract Feature

In [402]:
# Extract Feature
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Extract features using TF-IDF
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(data['cleaned_tweet'])
y = data['sentiment']
original_indices = data.index

# Split the data
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    X, y, original_indices, test_size=0.2, random_state=42
)

Modelling

In [403]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

# Preprocess the text data: convert to lowercase and ensure all entries are strings
X_train = [str(text).lower() for text in X_train]
X_test = [str(text).lower() for text in X_test]

# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define the model and hyperparameters grid
model_rf = RandomForestClassifier()
param_grid_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20, 30],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Perform Grid Search for RandomForest
grid_search_rf = GridSearchCV(estimator=model_rf, param_grid=param_grid_rf, cv=3, n_jobs=-1, verbose=2)
grid_search_rf.fit(X_train_tfidf, y_train)

print("Best RandomForest Model:", grid_search_rf.best_params_)
print("Best Score for RandomForest:", grid_search_rf.best_score_)


Fitting 3 folds for each of 108 candidates, totalling 324 fits
Best RandomForest Model: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 5, 'n_estimators': 100}
Best Score for RandomForest: 0.620155241583813


In [404]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define the model and hyperparameters grid
model_lr = LogisticRegression(max_iter=500, solver='saga')
param_grid_lr = {
    'scaler': [StandardScaler(with_mean=False)],  # Include the scaler in the grid
    'model__C': [0.01, 0.1, 1, 10, 100],  # Prefix parameters with 'model__'
    'model__penalty': ['l2']  # Correct penalty options
}

# Use Pipeline to include StandardScaler
pipeline_lr = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('model', model_lr)
])

# Perform Grid Search for Logistic Regression
grid_search_lr = GridSearchCV(estimator=pipeline_lr, param_grid=param_grid_lr, cv=3, n_jobs=-1, verbose=2)
grid_search_lr.fit(X_train_tfidf, y_train)

print("Best LogisticRegression Model:", grid_search_lr.best_params_)
print("Best Score for LogisticRegression:", grid_search_lr.best_score_)


Fitting 3 folds for each of 5 candidates, totalling 15 fits
Best LogisticRegression Model: {'model__C': 0.01, 'model__penalty': 'l2', 'scaler': StandardScaler(with_mean=False)}
Best Score for LogisticRegression: 0.6763561834990407


In [405]:
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text data
tfidf_vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

# Define the model and hyperparameters grid
model_svm = SVC()
param_grid_svm = {
    'scaler': [StandardScaler(with_mean=False)],  # Include the scaler in the grid
    'model__C': [0.1, 1, 10],
    'model__kernel': ['linear', 'rbf'],
    'model__gamma': ['scale', 'auto']
}

# Use Pipeline to include StandardScaler
pipeline_svm = Pipeline([
    ('scaler', StandardScaler(with_mean=False)),
    ('model', model_svm)
])

# Perform Grid Search for SVM
grid_search_svm = GridSearchCV(estimator=pipeline_svm, param_grid=param_grid_svm, cv=3, n_jobs=-1, verbose=2)
grid_search_svm.fit(X_train_tfidf, y_train)

print("Best SVM Model:", grid_search_svm.best_params_)
print("Best Score for SVM:", grid_search_svm.best_score_)


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best SVM Model: {'model__C': 0.1, 'model__gamma': 'scale', 'model__kernel': 'linear', 'scaler': StandardScaler(with_mean=False)}
Best Score for SVM: 0.6542124542124542


evaluasi dan visualisasi

In [407]:
# Evaluate model
y_pred = grid_search_svm.predict(X_test)
print(classification_report(y_test, y_pred))

# Plot confusion matrix
cm = confusion_matrix(y_test, y_pred, labels=['positive', 'negative', 'neutral'])
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=['positive', 'negative', 'neutral'])
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

# Visualisasi
# Additional Visualizations
import seaborn as sns

# Sentiment Distribution
plt.figure(figsize=(8, 6))
sns.countplot(x='sentiment', data=data, order=['positive', 'negative', 'neutral'])
plt.title('Sentiment Distribution')
plt.xlabel('Sentiment')
plt.ylabel('Count')
plt.show()

# Feature Importance
feature_importances = grid_search_svm.best_estimator_.coef_.toarray().flatten()
feature_names = vectorizer.get_feature_names_out()
feature_df = pd.DataFrame({'feature': feature_names, 'importance': feature_importances})
feature_df = feature_df.sort_values(by='importance', ascending=False).head(20)

plt.figure(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=feature_df)
plt.title('Top 20 Important Features')
plt.xlabel('Importance')
plt.ylabel('Feature')
plt.show()

# Save results to CSV
output_data = data[['full_text', 'time', 'cleaned_tweet', 'sentiment', 'textblob_sentiment']]
output_data.to_csv('tweets_with_sentiment.csv', index=False)

ValueError: could not convert string to float: "<compressed sparse row sparse matrix of dtype 'float64'\n\twith 6 stored elements and shape (1, 2669)>\n  coords\tvalues\n  (0, 964)\t0.06844262183600333\n  (0, 1840)\t0.5857061903725245\n  (0, 1264)\t0.38511145599908564\n  (0, 1754)\t0.36348041165106076\n  (0, 298)\t0.4179512879945823\n  (0, 1036)\t0.4440177290868321"