In [None]:
import pandas as pd

# Kaggle input locations of the labelled training set and the unlabelled test set.
train_path = '/kaggle/input/datathon-case2-1/case2_1-datasaur/case2_part1_train/train_org.csv'
test_path = '/kaggle/input/datathon-case2-1/case2_1-datasaur/org_test_final.csv'

# The first CSV column is used as the row index for both frames.
test = pd.read_csv(test_path, index_col=0)

# Number of leading training rows used to fit the model.
sample_size = 28_672

# `nrows` stops parsing after `sample_size` rows instead of loading the whole
# file and slicing it afterwards; the selected rows are the same leading rows.
# Note: column dtypes are now inferred from these rows only.
train = pd.read_csv(train_path, index_col=0, nrows=sample_size)

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import joblib
from nltk.corpus import stopwords

# Text column and labels; missing documents are replaced by empty strings.
X_train = train["DATA"].fillna("")
y_train = train["target"]
X_test = test["DATA"].fillna("")

# Step 1: whitespace tokenization.
X_train_tokens = X_train.str.split()
X_test_tokens = X_test.str.split()

# Step 2: stop-word removal using the combined NLTK lists of three languages.
stop_words = set()
for language in ('english', 'russian', 'kazakh'):
    stop_words.update(stopwords.words(language))


def drop_stop_words(tokens):
    """Return the tokens whose lower-cased form is not in `stop_words`."""
    return [token for token in tokens if token.lower() not in stop_words]


X_train_filtered = X_train_tokens.apply(drop_stop_words)
X_test_filtered = X_test_tokens.apply(drop_stop_words)

# Step 3: TF-IDF. The vocabulary is fitted on the training text only, the
# fitted vectorizer is persisted, and the test text is then projected onto it.
tfidf_vectorizer = TfidfVectorizer(max_features=3000)
train_documents = X_train_filtered.map(' '.join)
X_train_tfidf = tfidf_vectorizer.fit_transform(train_documents)
joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')
test_documents = X_test_filtered.map(' '.join)
X_test_tfidf = tfidf_vectorizer.transform(test_documents)
print("Finished TF-IDF for train and test")

# Step 4: fit a small random forest (fixed seed, all cores) and persist it.
rf_classifier = RandomForestClassifier(
    n_estimators=4,
    random_state=42,
    n_jobs=-1,
)
rf_classifier.fit(X_train_tfidf, y_train)
joblib.dump(rf_classifier, 'random_forest_classifier.pkl')
print("Finished model training")

# Step 5: predict the test set, attach the predictions to `test`, and write
# the ID/target pairs to the submission file.
y_pred_test = rf_classifier.predict(X_test_tfidf)
test["target"] = y_pred_test
submission_columns = ["ID", "target"]
test[submission_columns].to_csv("result_13.csv", index=False)
print("Finished")


In [None]:
# Round-trip check: reload the persisted vectorizer and classifier and predict
# one test document with them.
# NOTE: joblib.load unpickles the files; only load artifacts written by this notebook.
loaded_tfidf_vectorizer = joblib.load('tfidf_vectorizer.pkl')
loaded_rf_classifier = joblib.load('random_forest_classifier.pkl')

sample_raw = X_test[:1]
# The vectorizer was fitted on stop-word-filtered text, so the sample has to go
# through the same filtering before it is transformed; feeding the raw text
# would make inference preprocessing differ from training.
sample_documents = X_test_filtered[:1].map(' '.join)
sample_prediction = loaded_rf_classifier.predict(
    loaded_tfidf_vectorizer.transform(sample_documents)
)

# A single final expression so both the text and its prediction are displayed
# (an expression in the middle of a cell produces no output).
sample_raw.to_frame().assign(predicted_target=sample_prediction)

____

In [None]:
# Another approach: build a class-balanced training sample that takes the same
# number of rows from each of the most frequent target classes.

import pandas as pd

# Upper bound on the number of distinct target classes kept.
max_n_classes = 8192 // 2

df = pd.read_csv(train_path, index_col=0).fillna("")

# value_counts() already returns the classes ordered by descending frequency.
target_counts = df['target'].value_counts()

# The `max_n_classes` most frequent target values.
sorted_target_values = target_counts.index[:max_n_classes]

# Per-class row cap; classes with fewer rows contribute all the rows they have.
max_rows_per_target = sample_size // max_n_classes

# Position of each kept class in the frequency ranking, used to order the output.
class_rank = {target_value: rank for rank, target_value in enumerate(sorted_target_values)}

# Vectorised selection (replaces growing a frame with pd.concat in a loop):
# keep rows of the selected classes, take the first `max_rows_per_target` rows
# of each class, then stably order the classes by frequency rank so rows keep
# their file order within a class, and finally renumber the index from 0.
selected_df = (
    df[df['target'].isin(sorted_target_values)]
    .groupby('target', sort=False)
    .head(max_rows_per_target)
    .sort_values('target', key=lambda targets: targets.map(class_rank), kind='stable')
    .reset_index(drop=True)
)

# Now selected_df contains up to max_rows_per_target rows for each selected target.
# NOTE: this overwrites `train`; re-run the feature/model cell afterwards to use it.
train = selected_df

# With sample_size = 28_672 and max_n_classes = 4096 this shows at most 28,672
# rows, a cap of 7 rows per class, and at most 4096 distinct targets.
train.shape, max_rows_per_target, train["target"].nunique()