# Import Dependencies

In [41]:
import numpy as np
import pandas as pd
import gzip
import json

from pprint import pprint

In [42]:
#@title Turkish StopWords

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK stop-word corpus; the logged output of this cell shows the call
# is a no-op when the package is already up to date locally.
nltk.download('stopwords')
# Turkish stop-word list, later handed to TfidfVectorizer(stop_words=...).
turkish_stopwords = stopwords.words('turkish')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/muratdogan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Influencer Category Classification



1.   Read Data
2.   Preprocess Data
3.   Prepare Model
4.   Predict Test Data
5.   Save outputs



In [43]:
# Load the labelled accounts and give the two columns descriptive names.
train_classification_df = (
    pd.read_csv("train-classification.csv")
    .rename(columns={"Unnamed: 0": "user_id", "label": "category"})
)

# Unifying labels: lower-case every category so differently-cased spellings collapse.
train_classification_df["category"] = train_classification_df["category"].map(str.lower)

# user_id -> category lookup; membership in it is what later marks an account as labelled.
username2_category = train_classification_df.set_index("user_id")["category"].to_dict()

In [44]:
# stats about the labels: number of labelled accounts per category.
# The recorded output shows a strong class imbalance (e.g. 13 "gaming" vs. 511 "food").
train_classification_df.groupby("category").count()

Unnamed: 0_level_0,user_id
category,Unnamed: 1_level_1
art,191
entertainment,323
fashion,299
food,511
gaming,13
health and lifestyle,503
mom and children,149
sports,113
tech,346
travel,294


In [45]:
train_data_path = "training-dataset.jsonl.gz"

# Per-account lookups, split by whether the account has a category label.
username2posts_train = dict()
username2profile_train = dict()

username2posts_test = dict()
username2profile_test = dict()


# The dump is gzip-compressed JSON Lines (one account per line). Decode it
# explicitly as UTF-8: without `encoding`, text mode falls back to the platform
# default, which mis-decodes the Turkish captions on non-UTF-8 systems.
with gzip.open(train_data_path, "rt", encoding="utf-8") as fh:
  for line in fh:
    # tolerate blank lines (e.g. a trailing newline) instead of failing in json.loads
    if not line.strip():
      continue

    sample = json.loads(line)

    profile = sample["profile"]
    username = profile["username"]
    if username in username2_category:
      # train data info: the account has a category label
      username2posts_train[username] = sample["posts"]
      username2profile_train[username] = profile
    else:
      # no label available, so the account is treated as test data
      username2posts_test[username] = sample["posts"]
      username2profile_test[username] = profile


In [46]:
# The profile dicts are keyed by username, so the raw frame has one column per
# account; transpose it to one row per account and drop the username index
# (the row order is the insertion order of the source dict).
train_profile_df = (
    pd.DataFrame(username2profile_train)
    .transpose()
    .reset_index(drop=True)
)
test_profile_df = (
    pd.DataFrame(username2profile_test)
    .transpose()
    .reset_index(drop=True)
)


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
import re

def preprocess_text(text: str):
    """Normalise a caption into a whitespace-separated bag of lower-case tokens.

    The steps run in a fixed order: case-fold, strip URLs, drop every character
    that is not a lower-case Latin/Turkish letter, digit, whitespace, ``#`` or
    ``@`` (this removes emojis and punctuation), drop digits, collapse whitespace.

    Args:
        text: Raw caption text.

    Returns:
        The cleaned text; an empty string when nothing survives the filtering.
    """
    # (pattern, replacement, flags) applied in sequence -- the order matters:
    # URLs must go before punctuation is stripped, digits after the character filter.
    substitutions = (
        (r'http\S+|www\S+|https\S+', '', re.MULTILINE),  # URLs
        (r'[^a-zçğıöşü0-9\s#@]', '', 0),                 # special characters, punctuation, emojis
        (r'\d+', '', 0),                                 # numbers
        (r'\s+', ' ', 0),                                # runs of whitespace
    )

    # casefold() is locale-independent: "I" becomes "i" (not the Turkish "ı") and
    # "İ" becomes "i" plus a combining dot that the character filter removes.
    cleaned = text.casefold()
    for pattern, replacement, flags in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned, flags=flags)

    return cleaned.strip()


def _build_user_corpus(username2posts):
  """Aggregate each user's cleaned captions into one document.

  Args:
    username2posts: Mapping of username -> list of post dicts; a post's text is
      read from its "caption" key (missing, None or empty captions are skipped).

  Returns:
    (usernames, documents): two parallel lists in the iteration order of
    `username2posts`. Each document is the user's non-empty cleaned captions
    joined with "\n"; a user without usable captions gets an empty document.
  """
  usernames = []
  documents = []
  for username, posts in username2posts.items():
    cleaned_captions = []
    for post in posts:
      post_caption = post.get("caption")
      if not post_caption:
        continue

      post_caption = preprocess_text(post_caption)
      if post_caption != "":
        cleaned_captions.append(post_caption)

    usernames.append(username)
    # joining the posts of each user with a \n
    documents.append("\n".join(cleaned_captions))
  return usernames, documents


# train_usernames keeps the row order of the feature matrix, so labels can be aligned to it
train_usernames, corpus = _build_user_corpus(username2posts_train)

vectorizer = TfidfVectorizer(stop_words=turkish_stopwords, max_features=5000)

# fit the vectorizer on the training corpus and transform it into vectors in one pass
x_post_train = vectorizer.fit_transform(corpus)
y_train = [username2_category.get(uname, "NA") for uname in train_usernames]


test_usernames, test_corpus = _build_user_corpus(username2posts_test)

# Just transforming! No Fitting!!!!! (the vocabulary and IDF weights come from the train corpus only)
x_post_test = vectorizer.transform(test_corpus)

In [48]:
# Sanity check: every training account resolved to a real category (no "NA" placeholder).
assert "NA" not in y_train

In [49]:
# Vocabulary terms selected by the fitted TF-IDF vectorizer; used below as the
# column names of the dense feature frames.
feature_names = vectorizer.get_feature_names_out()
feature_names

array(['abdullah', 'abone', 'about', ..., 'şık', 'şıklık', 'şıklığı'],
      dtype=object)

In [50]:
# Densify the TF-IDF matrix into a DataFrame with one column per vocabulary term.
# Rows follow the order of `train_usernames`, the order in which the corpus was built.
df_tfidf = pd.DataFrame(x_post_train.toarray(), columns=feature_names)
df_tfidf.head(2)

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acı,ad,ada,adam,...,şubemiz,şubesi,şölen,şöleni,şöyle,şükranla,şükür,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.050596,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# (number of labelled accounts, number of TF-IDF features)
df_tfidf.shape

(2741, 5000)

In [52]:
from sklearn.model_selection import train_test_split

# Rebuild the full label list here instead of feeding `y_train` back into the
# split that overwrites it: that made the cell fail when run a second time,
# because the already-split labels no longer matched df_tfidf's row count.
# Rows of df_tfidf follow `train_usernames`, so the labels are looked up in that order.
y_all = [username2_category[uname] for uname in train_usernames]

# Stratify to preserve the (imbalanced) category proportions in both parts, and
# fix the seed so the split and every metric derived from it are reproducible.
x_train, x_val, y_train, y_val = train_test_split(
    df_tfidf, y_all, test_size=0.2, stratify=y_all, random_state=42
)

In [53]:
# training part of the split (test_size=0.2 leaves 80% of the rows here)
x_train.shape

(2192, 5000)

In [54]:
# validation part of the split (the 20% held out by test_size=0.2)
x_val.shape

(549, 5000)

# Naive Bayes Classifier



In [55]:

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV


# Candidate class priors must be valid probability vectors: one entry per class,
# summing to 1. Derive their length from the labels instead of hard-coding 10.
n_classes = len(set(y_train))
uniform_prior = [1.0 / n_classes] * n_classes
# Puts 30% of the mass on the first class and spreads the rest evenly.
# NOTE(review): sklearn orders classes by sorted label, so "first" is presumably
# the alphabetically smallest category -- confirm this skew is intended.
skewed_prior = [0.3] + [0.7 / (n_classes - 1)] * (n_classes - 1)

param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1.0, 2.0, 10.0],  # additive smoothing strength
    'fit_prior': [True, False],  # learn priors from the data vs. use a uniform prior
    'class_prior': [
        None,  # defer to fit_prior
        uniform_prior,
        skewed_prior,
    ],
}

model = MultinomialNB()

# Exhaustive search over the grid with 5-fold cross-validation on the training split.
grid_search = GridSearchCV(estimator=model, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(x_train, y_train)

print("Best Parameters:", grid_search.best_params_)


Best Parameters: {'alpha': 0.01, 'class_prior': None, 'fit_prior': True}


In [56]:
#@title Train Data

# Refit with the parameters the grid search actually selected rather than a
# hand-copied set, so this cell stays consistent when the search result changes.
model = MultinomialNB(**grid_search.best_params_)
model.fit(x_train, y_train)
y_train_pred = model.predict(x_train)

# Metrics on the data the model was fitted on: an optimistic upper bound that is
# only useful next to the validation metrics to judge over-fitting.
print("Accuracy:", accuracy_score(y_train, y_train_pred))
print("\nClassification Report:")
print(classification_report(y_train, y_train_pred, zero_division=0))

Accuracy: 0.8531021897810219

Classification Report:
                      precision    recall  f1-score   support

                 art       0.91      0.76      0.83       153
       entertainment       0.82      0.79      0.80       258
             fashion       0.82      0.90      0.86       239
                food       0.94      0.90      0.92       409
              gaming       1.00      1.00      1.00        10
health and lifestyle       0.82      0.83      0.83       402
    mom and children       0.90      0.85      0.87       119
              sports       0.96      0.91      0.94        90
                tech       0.83      0.91      0.87       277
              travel       0.77      0.81      0.79       235

            accuracy                           0.85      2192
           macro avg       0.88      0.87      0.87      2192
        weighted avg       0.86      0.85      0.85      2192



In [57]:
#@title Validation Data
# Score the fitted model on the held-out validation split.
y_val_pred = model.predict(x_val)

val_accuracy = accuracy_score(y_val, y_val_pred)
# zero_division=0 reports 0 instead of warning for classes that were never predicted
val_report = classification_report(y_val, y_val_pred, zero_division=0)

print("Accuracy:", val_accuracy)
print("\nClassification Report:")
print(val_report)

Accuracy: 0.6284153005464481

Classification Report:
                      precision    recall  f1-score   support

                 art       0.38      0.16      0.22        38
       entertainment       0.42      0.51      0.46        65
             fashion       0.58      0.72      0.64        60
                food       0.79      0.86      0.82       102
              gaming       0.00      0.00      0.00         3
health and lifestyle       0.62      0.66      0.64       100
    mom and children       0.78      0.23      0.36        30
              sports       0.83      0.43      0.57        23
                tech       0.72      0.81      0.76        69
              travel       0.58      0.61      0.60        59

            accuracy                           0.63       549
           macro avg       0.57      0.50      0.51       549
        weighted avg       0.63      0.63      0.61       549



In [58]:
#@title Test Data

test_data_path = "test-classification-round3.dat"

# The file holds one username per line. Skip blank lines (e.g. a trailing
# newline) so no empty "username" is requested later, and decode as UTF-8
# instead of relying on the platform default.
with open(test_data_path, "rt", encoding="utf-8") as fh:
  test_unames = [line.strip() for line in fh if line.strip()]

# let's take a look at the first 5 lines of the file
# (pure Python instead of `!head`, so the cell also runs where no POSIX shell exists)
print("\n".join(test_unames[:5]))

print("*****")

print(test_unames[:5])

livapastanesi
barisgross
tusasshop
etolyadigital
tugrulonur
*****
['livapastanesi', 'barisgross', 'tusasshop', 'etolyadigital', 'tugrulonur']


In [59]:
# username -> row index, built once so each lookup is O(1) instead of a list.index scan
test_row_of = {uname: row for row, uname in enumerate(test_usernames)}
train_row_of = {uname: row for row, uname in enumerate(train_usernames)}

x_test = []
# requested usernames for which no posts were loaded at all
missing_unames = []

for uname in test_unames:
  if uname in test_row_of:
    x_test.append(x_post_test[test_row_of[uname]].toarray()[0])
  elif uname in train_row_of:
    # the requested account is one of the labelled ones; reuse its training features
    x_test.append(x_post_train[train_row_of[uname]].toarray()[0])
  else:
    print(uname)
    missing_unames.append(uname)
    # Keep x_test aligned with test_unames: skipping the row would shift every
    # later prediction onto the wrong username. An all-zero vector carries no
    # text evidence.
    x_test.append(np.zeros(x_post_test.shape[1]))


In [60]:
# Stack the per-user feature vectors into a frame with the same column names as
# the training features, so the fitted model can consume it.
df_test = pd.DataFrame(np.array(x_test), columns=feature_names)
df_test.head(2)

Unnamed: 0,abdullah,abone,about,acele,acil,activities,acı,ad,ada,adam,...,şubemiz,şubesi,şölen,şöleni,şöyle,şükranla,şükür,şık,şıklık,şıklığı
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.038429,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [61]:
# Predict a category for every row of the test feature frame.
test_pred = model.predict(df_test)

# username -> predicted category; row i of df_test belongs to test_unames[i]
output = {uname: test_pred[position] for position, uname in enumerate(test_unames)}

def capitalize_title(category):
    """Title-case a category label while keeping connector words lower-case.

    Args:
        category: Category label, e.g. "health and lifestyle".

    Returns:
        The label with each word capitalised (first letter upper, rest lower),
        except "and", which stays lower-case; words are re-joined with single
        spaces, e.g. "Health and Lifestyle".
    """
    # the words need to be not capital
    lowercase_words = {"and"}

    def _format(word):
        lowered = word.lower()
        return lowered if lowered in lowercase_words else word.capitalize()

    return " ".join(map(_format, category.split()))

In [62]:
# Convert the lower-cased labels to their title-cased form ("and" stays lower-case).
output = dict(
    (uname, capitalize_title(category)) for uname, category in output.items()
)

# Persist the username -> category mapping as pretty-printed JSON.
with open("output.json", "w") as out_file:
  json.dump(output, out_file, indent=4)

# Like Count Prediction


In [63]:
from datetime import datetime

In [64]:
HYPERPARAMS = {
    "regularization": 0.1,  # Regularization factor to adjust the average
    "time_decay": 0.05,     # Decay factor for timestamp weighting
    "low_engagement_threshold": 0.2,  # engagement ratio below which shrinkage is strengthened
    "low_engagement_multiplier": 2,   # factor applied to "regularization" in that case
}

def predict_like_count(username, current_post=None, reference_time=None):
    """Predict a post's like count as a shrunk, recency-weighted mean of the user's likes.

    Each post is weighted by ``1 / (1 + time_decay * age_in_days)`` (weight 1.0
    when it has no timestamp) and the weighted mean is multiplied by
    ``1 - regularization``. The regularization is multiplied by
    ``low_engagement_multiplier`` when the engagement ratio falls below
    ``low_engagement_threshold``.

    Args:
        username: Account to predict for; looked up in the train posts first,
            then in the test posts.
        current_post: Optional post dict. The post with the same "id" is left
            out of the average, so a post never predicts itself.
        reference_time: ``datetime`` against which post age is measured.
            Defaults to ``datetime.now()``, evaluated once per call; pass a
            fixed value for reproducible predictions.

    Returns:
        The predicted like count as a float; ``0.0`` when the user has no
        usable posts; ``-1`` (after printing a message) when the username is
        unknown.

    Raises:
        ValueError: If a post timestamp is not in "%Y-%m-%d %H:%M:%S" format.
    """
    # Plain dict lookups: the per-user profile dicts are the source of the
    # profile DataFrames, so this avoids scanning a whole frame on every call.
    if username in username2posts_train:
        posts = username2posts_train[username]
        profile = username2profile_train.get(username) or {}
    elif username in username2posts_test:
        posts = username2posts_test[username]
        profile = username2profile_test.get(username) or {}
    else:
        print(f"No data available for {username}")
        return -1

    if len(posts) == 0:
        return 0.

    if reference_time is None:
        reference_time = datetime.now()
    excluded_id = current_post.get("id") if current_post is not None else None

    total = 0.
    total_weight = 0.0
    for post in posts:
        if excluded_id is not None and post.get("id") == excluded_id:
            continue

        like_count = post.get("like_count", 0)
        if like_count is None:
            like_count = 0

        timestamp = post.get("timestamp", None)
        if timestamp:
            post_date = datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S")
            # clamp at 0 so a reference time earlier than the post cannot
            # produce a negative, infinite or >1 weight
            days_since_post = max((reference_time - post_date).days, 0)
            weight = 1 / (1 + HYPERPARAMS["time_decay"] * days_since_post)
        else:
            weight = 1.0

        # the 1/len(posts) scaling cancels in total / total_weight, but it does
        # scale `total`, which feeds the engagement ratio below
        weight = weight / len(posts)

        total += like_count * weight
        total_weight += weight

    # every post was excluded (e.g. the user's only post is `current_post`)
    if total_weight == 0:
        return 0.0

    regularization = HYPERPARAMS["regularization"]

    # NOTE(review): the ratio uses the weight-scaled sum `total`, not the
    # weighted mean -- kept as-is to preserve existing predictions; confirm intent.
    follower_count = profile.get("follower_count")
    # A missing or zero follower count gives no engagement ratio (it used to
    # crash with a TypeError / ZeroDivisionError); fall back to the base regularization.
    if follower_count and (
        total / follower_count < HYPERPARAMS.get("low_engagement_threshold", 0.2)
    ):
        regularization = regularization * HYPERPARAMS.get("low_engagement_multiplier", 2)

    return (total / total_weight) * (1 - regularization)

In [65]:
def log_mse_like_counts(y_true, y_pred):
  """Mean squared error between log1p-transformed like counts.

  Args:
    y_true: Sequence of observed like counts.
    y_pred: Sequence of predicted like counts, aligned with `y_true`.

  Returns:
    mean((log(1 + y_true) - log(1 + y_pred)) ** 2) as a NumPy float.
  """
  # log1p keeps zero counts finite and compresses the heavy right tail
  log_difference = np.log1p(np.array(y_true)) - np.log1p(np.array(y_pred))
  return np.mean(log_difference ** 2)

In [66]:
#@title Train Dataset evaluation

# Leave-one-out style check: each post is predicted from the same user's other
# posts (the post itself is passed so that it is excluded from the average).
y_like_count_train_true = []
y_like_count_train_pred = []
for uname, posts in username2posts_train.items():
  for post in posts:
    observed = post.get("like_count", 0)
    # a missing or null like count is scored as 0
    y_like_count_train_true.append(0 if observed is None else observed)
    y_like_count_train_pred.append(predict_like_count(uname, post))

train_log_mse = log_mse_like_counts(y_like_count_train_true, y_like_count_train_pred)
print(f"Log MSE Train= {train_log_mse}")

Log MSE Train= 1.1401158884334879


In [67]:
#@title Test Dataset

path = "test-regression-round3.jsonl"

# post id -> predicted like count
dic = {}
# JSON Lines input: one post per line, decoded explicitly as UTF-8
with open(path, "rt", encoding="utf-8") as fh:
    for line in fh:
        # tolerate blank lines instead of failing in json.loads
        if not line.strip():
            continue
        sample = json.loads(line)

        pred_val = predict_like_count(sample["username"])
        # predict_like_count returns -1 for an unknown user; a like count can
        # never be negative, so clamp that sentinel to 0 before it is written out
        sample["like_count"] = max(int(pred_val), 0)
        dic[sample["id"]] = sample["like_count"]

# Intermediate "id: count" text file, one prediction per line.
with open("prediction-regression-round", "wt") as of:
    for key, value in dic.items():
        of.write(f"{key}: {value}\n")

In [68]:
input_path = "prediction-regression-round"

# post id -> predicted like count, parsed back from the "id: count" text file
data = {}


with open(input_path, "r") as file:
    for line in file:
        line = line.strip()
        if not line:
            continue
        # Split on the LAST colon only: the count never contains one, while an
        # id might, and a plain split(":") would then fail to unpack.
        key, separator, value = line.rpartition(":")
        if not separator:
            raise ValueError(f"Malformed prediction line (expected 'id: count'): {line!r}")
        data[key.strip()] = int(value.strip())


# Final submission file: a single JSON object mapping id -> like count.
with open("prediction-regression-round3.json", "w") as json_file:
    json.dump(data, json_file, indent=4)
