In [3]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint

In [4]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
annotated_users_df = pd.read_csv("/content/annotated_users_CS412-96d33240b352.csv")

train_classification_df = pd.read_csv("/content/train-classification.csv",)

train_classification_df = train_classification_df.rename(columns={'Unnamed: 0': 'user_id', 'label': 'category'})

annotated_users_df = annotated_users_df.rename (columns={'Unnamed: 0': 'user_id', 'influencerCategory': 'category'})
annotated_users_df.drop(columns=["url", "influencerMention","accountType"], inplace=True)
train_classification_df = pd.concat([train_classification_df, annotated_users_df], ignore_index=True)
train_classification_df = train_classification_df.dropna()
train_classification_df["category"] = train_classification_df["category"].apply(str.lower)
username2_category = train_classification_df.set_index("user_id").to_dict()["category"]


In [7]:
# stats about the labels
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,196
entertainment,360
fashion,306
food,529
gaming,19
health and lifestyle,529
mom and children,156
sports,126
tech,368
travel,302


In [8]:
train_data_path = "/content/training-dataset.jsonl.gz"

username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


with gzip.open(train_data_path, "rt") as fh:
  for line in fh:
    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile


    else:
      # it is test data info
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile


In [9]:
# Profile Dataframe
train_profile_df = pd.DataFrame(username2profile_train).T.reset_index(drop=True)
test_profile_df = pd.DataFrame(username2profile_test).T.reset_index(drop=True)


In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def preprocess_text(text: str):

    text = text.casefold()

    # Remove URLs
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)

    # Remove special characters and punctuation
    text = re.sub(r'[^a-zçğıöşü0-9\s#@]', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text


corpus = []

# to keep the label order
train_usernames = []

for username, posts in username2posts_train.items():
  train_usernames.append(username)

  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)


  # joining the posts of each user with a \n
  user_post_captions = "\n".join(cleaned_captions)
  corpus.append(user_post_captions)


vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)

# fit the vectorizer
vectorizer.fit(corpus)


# transform the data into vectors
x_post_train = vectorizer.transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]


test_usernames = []
test_corpus = []
for username, posts in username2posts_test.items():
  test_usernames.append(username)
  # aggregating the posts per user
  cleaned_captions = []
  for post in posts:
    post_caption = post.get("caption", "")
    if post_caption is None:
      continue

    post_caption = preprocess_text(post_caption)

    if post_caption != "":
      cleaned_captions.append(post_caption)

  user_post_captions = "\n".join(cleaned_captions)
  test_corpus.append(user_post_captions)


# Just transforming! No Fitting!!!!!
x_post_test = vectorizer.transform(test_corpus)

In [11]:
# Making sure everything is fine
assert y_train.count("NA") == 0

In [12]:
feature_names = vectorizer.get_feature_names_out()

In [13]:
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)

In [14]:
from sklearn.model_selection import train_test_split

x_train, x_val, y_train, y_val = train_test_split(df_tfidf, y_train, test_size=0.2, stratify=y_train)

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

# Initialize the Random Forest Classifier
rf_classifier = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)


# Train the model on the training data
rf_classifier.fit(x_train, y_train)

y_train_pred = rf_classifier.predict(x_train)

print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))


Accuracy: 0.7140311804008909

Classification Report:
                      precision    recall  f1-score   support

                 art       1.00      0.16      0.28       154
       entertainment       0.76      0.65      0.70       274
             fashion       0.91      0.74      0.82       239
                food       0.92      0.94      0.93       414
              gaming       0.00      0.00      0.00        13
health and lifestyle       0.44      0.97      0.61       412
    mom and children       1.00      0.17      0.29       119
              sports       1.00      0.06      0.12        96
                tech       0.94      0.84      0.89       285
              travel       0.94      0.70      0.80       239

            accuracy                           0.71      2245
           macro avg       0.79      0.52      0.54      2245
        weighted avg       0.82      0.71      0.69      2245



In [16]:
y_val_pred = rf_classifier.predict(x_val)

print("Accuracy:", accuracy_score(y_val, y_val_pred))
print("\nClassification Report:")
print(classification_report(y_val, y_val_pred, zero_division=0))

Accuracy: 0.5266903914590747

Classification Report:
                      precision    recall  f1-score   support

                 art       0.00      0.00      0.00        39
       entertainment       0.35      0.16      0.22        68
             fashion       0.82      0.53      0.65        60
                food       0.71      0.92      0.80       104
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.34      0.78      0.47       103
    mom and children       0.00      0.00      0.00        30
              sports       0.00      0.00      0.00        24
                tech       0.62      0.61      0.61        71
              travel       0.64      0.57      0.60        60

            accuracy                           0.53       562
           macro avg       0.35      0.36      0.34       562
        weighted avg       0.47      0.53      0.47       562



In [17]:
#@title Test Data


# let's take a look at the first 5 lines of the file
test_data_path = "/content/test-classification-round3.dat"


test_unames = []
with open(test_data_path, "rt") as fh:
  for line in fh:
    test_unames.append(line.strip())


In [19]:
x_test = []

for uname in test_unames:
  try:
    index = test_usernames.index(uname)
    x_test.append(x_post_test[index].toarray()[0])
  except Exception as e:
    try:
      index = train_usernames.index(uname)
      x_test.append(x_post_train[index].toarray()[0])
    except Exception as e:
      print(uname)




In [20]:
df_test = pd.DataFrame(np.array(x_test), columns=feature_names)

In [21]:
test_pred = rf_classifier.predict(df_test)

output = dict()
for index, uname in enumerate(test_unames):
  output[uname] = test_pred[index]

In [22]:
with open("output.json", "w") as of:
  json.dump(output, of, indent=4)


In [23]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
import json

# Load the test_regression dataset
path = "/content/test-regression-round3.jsonl"
test_regression = pd.read_json(path, lines=True)

# Step 1: Flatten the username2posts_test dataset
all_posts = []

# Iterate through users and posts
for username, posts in username2posts_test.items():
    for post in posts:
        post['username'] = username  # Add username as a field
        all_posts.append(post)

# Convert to DataFrame
data = pd.DataFrame(all_posts)

# Combine the datasets
combined_data = pd.concat([data, test_regression], ignore_index=True)

# Step 2: Handle Missing Data
combined_data['caption'] = combined_data['caption'].fillna('')  # Replace missing captions with empty strings
combined_data['comments_count'] = combined_data['comments_count'].fillna(0)  # Fill missing comment counts with 0
combined_data = combined_data.dropna(subset=['like_count'])  # Drop rows with missing target values

# Feature Engineering
combined_data['caption_length'] = combined_data['caption'].apply(len)  # Length of the caption
combined_data['num_emojis'] = combined_data['caption'].apply(
    lambda x: sum(1 for char in x if char in '😀😃😄😁😆')
)  # Emoji detection

# Encode Media Type (if it exists)
if 'media_type' in combined_data.columns:
    combined_data = pd.get_dummies(combined_data, columns=['media_type'], drop_first=True)

# Prepare Features and Target
X = combined_data[['caption_length', 'num_emojis', 'comments_count']]
if 'media_type_IMAGE' in combined_data.columns:
    X['media_type_IMAGE'] = combined_data['media_type_IMAGE']  # Include media type if available

y = combined_data['like_count']

# Step 3: Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Step 4: Train Model
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

# Step 5: Evaluate Model
y_pred = model.predict(X_test)
print("Mean Absolute Error:", mean_absolute_error(y_test, y_pred))

# Step 6: Predict on Test Data
test_data_predictions = model.predict(X_test)

# Step 7: Create Output Dictionary
output = {post_id: int(pred) for post_id, pred in zip(combined_data.loc[X_test.index, 'id'], test_data_predictions)}

# Step 8: Save Output to File
with open('predicted_like_counts.json', 'w') as f:
    json.dump(output, f, indent=4)

print("Predictions saved to 'predicted_like_counts.json'")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  combined_data['caption_length'] = combined_data['caption'].apply(len)  # Length of the caption
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X['media_type_IMAGE'] = combined_data['media_type_IMAGE']  # Include media type if available


Mean Absolute Error: 7010.5513429888315
Predictions saved to 'predicted_like_counts.json'
