In [1]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint

In [2]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Osama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [3]:
train_classification_df = pd.read_csv("/Users/Osama/Downloads/CS412PROJ/train-classification.csv",)
train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

# Unifying labels
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]

In [4]:
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [5]:
username2_category["kod8net"]

'tech'

In [9]:
# Load the additional CSV file
additional_data_path = "/Users/Osama/Downloads/CS412PROJ/annotated_users_CS412-2753ef4cf74e.csv"
additional_data_df = pd.read_csv(additional_data_path)

# Extract and clean the data (renaming columns and handling the category)
additional_data_df = additional_data_df[['url', 'influencerCategory']].rename(columns={
    'url': 'user_id', 
    'influencerCategory': 'category'
})

# Make sure the category is lowercase (if required)
additional_data_df["category"] = additional_data_df["category"].apply(str.lower)

# Append the new data to the original train_classification_df
train_classification_df = pd.concat([train_classification_df, additional_data_df], ignore_index=True)

# Update the username2_category dictionary with the new data
username2_category.update(additional_data_df.set_index("user_id").to_dict()["category"])

# Check the updated data by viewing the first few rows
print(train_classification_df.head())

# Re-check the category distribution
print(train_classification_df.groupby("category").count())


TypeError: descriptor 'lower' for 'str' objects doesn't apply to a 'float' object

In [7]:
train_data_path = "/Users/Osama/Downloads/CS412PROJ/training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile

In [None]:
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)

train_profile_df.head(2)

In [None]:
test_profile_df.head(2)

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re
import emoji


def preprocess_text(text: str):
    

    # lower casing Turkish Text, Don't use str.lower :)
    text = text.casefold()

    #text = emoji.demojize(text, delimiters=(" ", " "))

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    # HERE THE EMOJIS stuff are being removed, you may want to keep them :D
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


corpus = []

# to keep the label order
train_usernames = []

for username, posts in username2posts_train.items():
  train_usernames.append(username)

  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)


  # joining the posts of each user with a \n
  user_post_captions = "\n".join(cleaned_captions)
  corpus.append(user_post_captions)



#custom_stopwords = list(set(turkish_stopwords).union({
 #   'the', 'and', 'with', 'for', 'you', 'to', 'of', 'in', 'our', 'your', 'is', 'are','bir'
#}))
vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=10000,min_df=10,ngram_range=(1, 3))

# fit the vectorizer
vectorizer.fit(corpus)

# transform the data into vectors
x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]
feature_names = vectorizer.get_feature_names_out()

# Inspect the frequency of each word
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)

# Show the most frequent words (words in many posts)
print(df_tfidf.sum().sort_values(ascending=False).head(30))




test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
  test_usernames.append(username)
  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)

  user_post_captions = "\n".join(cleaned_captions)
  test_corpus.append(user_post_captions)


# Just transforming! No Fitting!!!!!
x_post_test = vectorizer.transform(test_corpus)

In [1289]:
# Making sure everything is fine
assert y_train.count("NA") == 0

In [None]:
feature_names = vectorizer.get_feature_names_out()
feature_names

In [None]:
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)
df_tfidf.head(2)

In [None]:
df_tfidf.shape

In [1293]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(df_tfidf, y_train, test_size=0.2, stratify=y_train)

In [None]:
x_train.shape

In [None]:
x_val.shape

In [None]:
'''from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


model = MultinomialNB()
model.fit(x_train, y_train)'''
from sklearn.svm import SVC
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.utils.class_weight import compute_class_weight
import numpy as np

import numpy as np
from collections import Counter

# Count the number of users in each class
class_counts = Counter(y_train)

# Total number of samples
total_samples = len(y_train)

# Calculate class weights using log10
'''log_class_weights = {
    cls: np.log10(1 + total_samples / count)  # Logarithmic scaling of class weight
    for cls, count in class_counts.items()
}
'''
unique_classes, class_counts = np.unique(y_train, return_counts=True)
log_class_weights = {cls: 1 / np.log10(1 + cnt) for cls, cnt in zip(unique_classes, class_counts)}
print("Class Weights with Log10 scaling:", log_class_weights)

class_weights = compute_class_weight(
    class_weight='balanced', 
    classes=np.unique(y_train), 
    y=y_train
)
class_weight_dict = {cls: weight for cls, weight in zip(np.unique(y_train), class_weights)}
print("Class Weights:", class_weight_dict)


from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, confusion_matrix

# Initialize SVM model with class weights
svm_model = LinearSVC(class_weight=log_class_weights,max_iter=2000,random_state=42)



#svm_model = LinearSVC(class_weight=class_weight_dict, max_iter=2000)
#non_linear_svm = SVC(kernel='rbf', class_weight=log_class_weights,max_iter=2000,random_state=42)
# Train the model
svm_model.fit(x_train, y_train)

In [None]:
'''from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np

# Calculate class weights using log10
unique_classes, class_counts = np.unique(y_train, return_counts=True)
class_weights = {cls: 1 / np.log10(1 + cnt) for cls, cnt in zip(unique_classes, class_counts)}

print("Class Weights (log10 adjusted):", class_weights)

# Create Logistic Regression model
log_reg = LogisticRegression(
    multi_class='multinomial',  # for multiclass classification
    solver='lbfgs',            # efficient solver for multiclass problems
    class_weight=class_weights,  # handle class imbalance
    max_iter=1000               # allow sufficient iterations for convergence
)

# Train the model
log_reg.fit(x_train, y_train)

# Predict on validation set
y_val_pred = log_reg.predict(x_val)

# Evaluate the model
print("Classification Report:")
print(classification_report(y_val, y_val_pred))

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(y_val, y_val_pred))
'''


In [None]:
'''from sklearn.model_selection import GridSearchCV
param_grid = {'svc__C': [1, 5, 10, 50],
              'svc__gamma': [0.0001, 0.0005, 0.001, 0.005]}
grid = GridSearchCV(svm_model, param_grid)

%time grid.fit(Xtrain, ytrain)
print(grid.best_params_)'''


In [None]:
#@title Train Data
y_train_pred = svm_model.predict(x_train)

print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))

In [None]:
y_val_pred = svm_model.predict(x_val)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))

In [None]:

from sklearn.feature_selection import chi2

chi2_scores, p_values = chi2(x_train, y_train)
feature_ranks = pd.DataFrame({
    "feature": feature_names,
    "chi2": chi2_scores
}).sort_values(by="chi2", ascending=False)

# Display top features
print(feature_ranks.head(50))

In [None]:
from sklearn.metrics import confusion_matrix

conf_matrix = confusion_matrix(y_val, y_val_pred, labels=np.unique(y_train))
conf_matrix_df = pd.DataFrame(
    conf_matrix, index=np.unique(y_train), columns=np.unique(y_train)
)

print("Confusion Matrix:")
print(conf_matrix_df)

# Analyze specific misclassifications
print("Misclassified samples:")
for i, (true, pred) in enumerate(zip(y_val, y_val_pred)):
    if true != pred:
        print(f"Sample {i}: True={true}, Predicted={pred}")

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Create bigrams and trigrams
vectorizer = TfidfVectorizer(ngram_range=(1, 3))  # This will generate unigrams, bigrams, and trigrams
X_train_tfidf = vectorizer.fit_transform(x_train)

# Apply chi-squared test again on the new features
chi2_scores, p_values = chi2(X_train_tfidf, y_train)

# Create a dataframe of the features and their chi2 scores
feature_names = vectorizer.get_feature_names_out()
feature_ranks = pd.DataFrame({
    "feature": feature_names,
    "chi2": chi2_scores
}).sort_values(by="chi2", ascending=False)

# Display top features
print(feature_ranks.head(20))


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Compute the confusion matrix
cm = confusion_matrix(y_val, y_val_pred)

# Create a heatmap
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Labels')
plt.ylabel('True Labels')
plt.show()


In [None]:
# Ensure that validation data is properly referenced
# Assuming you have a list of usernames and captions for the validation set
val_usernames = train_usernames[len(x_train):]  # Example placeholder
val_corpus = corpus[len(x_train):]  # Example placeholder

# Ensure y_val and y_val_pred are aligned with validation usernames
y_val_aligned = y_val[:len(val_usernames)]
y_val_pred_aligned = y_val_pred[:len(val_usernames)]

# Recreate the DataFrame for validation data
val_data = pd.DataFrame({
    "Username": val_usernames,
    "True Label": y_val_aligned,
    "Predicted Label": y_val_pred_aligned,
    "Captions": val_corpus
})

# Identify misclassified samples
misclassified = val_data[val_data["True Label"] != val_data["Predicted Label"]]

# Print misclassified samples with their captions
print("Misclassified Samples with Captions:")
for _, row in misclassified.iterrows():
    print(f"Username: {row['Username']}")
    print(f"True Label: {row['True Label']}")
    print(f"Predicted Label: {row['Predicted Label']}")
    print(f"Captions: {row['Captions']}\n")


In [None]:
#@title Test Data
test_data_path = "/Users/Osama/Downloads/CS412PROJ/test-classification-round1.dat"

with open(test_data_path, "rt") as fh:
    for i, line in enumerate(fh):
        print(line.strip())
        if i >= 4:  # Stop after 5 lines
            break

print("*****")

test_unames = []
with open(test_data_path, "rt") as fh:
  for line in fh:
    test_unames.append(line.strip())

print(test_unames[:5])

In [None]:
x_test = []

for uname in test_unames:
  try:
    index = test_usernames.index(uname)
    x_test.append(x_post_test[index].toarray()[0])
  except Exception as e:
    try:
      index = train_usernames.index(uname)
      x_test.append(x_post_train[index].toarray()[0])
    except Exception as e:
      print(uname)


test_unames.remove("screenname")

In [None]:
df_test = pd.DataFrame(np.array(x_test), columns=feature_names)
df_test.head(2)

In [None]:
'''import gzip
import json

# Open the gzip-compressed JSONL file
with gzip.open('training-dataset.jsonl.gz', 'rt', encoding='utf-8') as f:
    for line in f:
        # Each line is a JSON object, so load it
        data = json.loads(line)
        print(data)  # You can process the data as needed
'''