In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import hstack

In [2]:
#Data Loading and Preprocessing
data = pd.read_csv("checkworthiness_labeled.csv")  



In [3]:
data.head()

Unnamed: 0,ID,Text,Category
0,16,I think we've seen a deterioration of values.,No
1,17,I think for a while as a nation we condoned th...,No
2,20,"We got away, we got into this feeling that val...",No
3,21,And I don't believe that at all I do believe t...,No
4,22,"And, of course, as far as the how we make it b...",No


In [4]:
missing_values = data.isnull().sum()
missing_values

ID          0
Text        0
Category    0
dtype: int64

In [5]:
data.shape[0]

8993

In [6]:
# Data Preprocessing
######
def preprocess_text(text):
    # lowercase
    text = text.lower()  
    # remove punctuation and special characters
    text = re.sub(r"[^a-z0-9\s]", "", text)  
    # remove extra whitespace
    text = re.sub(r"\s+", " ", text)  
    return text

data["Text"] = data["Text"].apply(preprocess_text)





In [7]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

In [18]:
def tokenize_and_lemmatize(text):
    tokens = nltk.word_tokenize(text)
    lemmas = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]
    return lemmas

In [19]:
#Feature Engineering
# 3.1 TF-IDF Features
#tfidf_vectorizer = TfidfVectorizer(max_features=2000, ngram_range=(1, 2))  # Consider unigrams and bigrams
tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_and_lemmatize, max_features=5000)
tfidf_features = tfidf_vectorizer.fit_transform(data["Text"])




In [10]:
# 3.2 Count-Based Features (e.g., word count, punctuation count)
'''def count_features(text):
    word_count = len(text.split())
    punctuation_count = len(re.findall(r"[^\w\s]", text))
    return pd.Series({'word_count': word_count, 'punctuation_count': punctuation_count})

count_features_df = data["Text"].apply(count_features)
'''


In [20]:
# Count Vectorizer Features (for N-grams)
count_vectorizer = CountVectorizer(ngram_range=(1, 2), tokenizer=tokenize_and_lemmatize, max_features=3000)
count_features = count_vectorizer.fit_transform(data["Text"])

In [26]:
# 3.3 Combine Features
#X = pd.concat([count_features_df, pd.DataFrame(tfidf_vectorizer.fit_transform(data["Text"]).toarray())], axis=1)
#y = data["Category"]
# Combine TF-IDF and Count Features
combined_features = hstack((tfidf_features, count_features))
#combined_features = pd.SparseDataFrame.hstack((tfidf_features, count_features))

In [27]:
# Dimensionality Reduction (Optional)
svd = TruncatedSVD(n_components=300)  # Adjust n_components as needed
reduced_features = svd.fit_transform(combined_features)

In [28]:
# Feature Extraction
#convert the raw text data into numerical features
# Using TF-IDF for feature extraction
#TF-IDF measures the importance of a word in a document relative to a collection of documents
##vectorizer = TfidfVectorizer(max_features=1000)  
#number of features is 1000 to reduce dimensionality of the feature space
##X = vectorizer.fit_transform(data['Text'])
##y = data['Category']


In [12]:
# Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [28]:
# Model Training
#binary classification
X_train, X_test, y_train, y_test = train_test_split(reduced_features, data["Category"], test_size=0.2, random_state=42)
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)



In [29]:
# Evaluation
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
print("Classification Report:")
print(classification_report(y_test, y_pred))



Accuracy: 0.8232351306281267
Classification Report:
              precision    recall  f1-score   support

          No       0.83      0.98      0.90      1414
         Yes       0.79      0.24      0.37       385

    accuracy                           0.82      1799
   macro avg       0.81      0.61      0.63      1799
weighted avg       0.82      0.82      0.78      1799



In [31]:
# Prediction
predict_data = pd.read_csv("checkworthiness_leaderboard.csv")  # Load the data for prediction

In [32]:
# Preprocess the text data in the 'predict.csv' file
predict_data["Text"] = predict_data["Text"].apply(preprocess_text)

# Apply feature engineering to the new data
predict_tfidf_features = tfidf_vectorizer.transform(predict_data["Text"])
predict_count_features = count_vectorizer.transform(predict_data["Text"])
predict_combined_features = hstack((predict_tfidf_features, predict_count_features))
predict_reduced_features = svd.transform(predict_combined_features)  # Apply dimensionality reduction

# Make predictions
predictions = model.predict(predict_reduced_features)



In [None]:
# Prediction
leaderboard_data = pd.read_csv("checkworthiness_leaderboard.csv")
X_leaderboard = vectorizer.transform(leaderboard_data['Text'])
predictions = clf.predict(X_leaderboard)



In [34]:
# Save predictions to 'sub.csv'
submission_data = pd.DataFrame({"ID": predict_data["ID"], "Category": predictions})
submission_data.to_csv("submission.csv", index=False)

In [50]:
submission.shape[0]

1467