# DATA PREPROCESSING

___

### IMPORT LIBRARIES

In [1]:
import pandas as pd
import datetime
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from catboost import CatBoostClassifier
from sklearn.model_selection import train_test_split
import re
import string

import nltk
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

[nltk_data] Downloading package wordnet to /home/karpov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### LOADING DATAFRAMES

In [2]:
# Read the four source tables from CSV files located relative to the
# current working directory, in this fixed order.
user_data, post_text_df, feed_data, count_actions = (
    pd.read_csv(csv_path)
    for csv_path in (
        'user_data.csv',
        'post_text_df.csv',
        'feed_data.csv',
        'counT_actions.csv',
    )
)

In [3]:
# Print the (rows, columns) shape of every loaded table, in load order.
for loaded_frame in (user_data, post_text_df, feed_data, count_actions):
    print(loaded_frame.shape)

(163205, 8)
(7023, 3)
(1000000, 5)
(163205, 2)


In [4]:
# Preview the first rows of the user table.
user_data.head()

Unnamed: 0,user_id,gender,age,country,city,exp_group,os,source
0,200,1,34,Russia,Degtyarsk,3,Android,ads
1,201,0,37,Russia,Abakan,0,Android,ads
2,202,1,17,Russia,Smolensk,4,Android,ads
3,203,0,18,Russia,Moscow,1,iOS,ads
4,204,0,36,Russia,Anzhero-Sudzhensk,3,Android,ads


In [5]:
# Expose the `count` values under the more descriptive name `count_actions`
# (the original `count` column is removed in the next cell).
count_actions = count_actions.assign(count_actions=count_actions['count'])

In [6]:
# Remove the original `count` column, then display the resulting table.
count_actions = count_actions.drop(columns=['count'])
count_actions

Unnamed: 0,user_id,count_actions
0,200,401
1,201,748
2,202,724
3,203,382
4,204,161
...,...,...
163200,168548,382
163201,168549,274
163202,168550,407
163203,168551,525


In [7]:
# Left join on `user_id`: every row of user_data is kept and receives the
# columns of count_actions for the matching user (NaN when there is no match).
user_data = user_data.merge(count_actions, how='left', on='user_id')

### POST_TEXT_DF. TEXT PREPROCESSING USING TFIDF AND PCA. ONE HOT ENCODING

In [8]:
wnl = WordNetLemmatizer()

# Matches any single ASCII punctuation character. The characters are escaped
# because string.punctuation contains `\`, `]`, `^` and `-`, which are special
# inside a regex character class; unescaped, the sequence `\]` is read as an
# escaped `]` and the backslash itself is never matched.
_PUNCTUATION_RE = re.compile(r"[{}]".format(re.escape(string.punctuation)))


def preprocessing(line, token=wnl):
    """Normalize a text for TF-IDF vectorization.

    The text is lower-cased, every ASCII punctuation character is replaced
    by a space, the result is split on whitespace and each piece is passed
    through ``token.lemmatize``.

    Args:
        line: The text to normalize (a ``str``).
        token: Object exposing ``lemmatize(word)``; defaults to the
            module-level ``WordNetLemmatizer`` instance.

    Returns:
        The lemmatized words joined by single spaces.
    """
    line = line.lower()
    line = _PUNCTUATION_RE.sub(" ", line)
    # str.split() with no argument splits on any whitespace run, newlines
    # included, so no separate newline replacement is required.
    return ' '.join(token.lemmatize(word) for word in line.split())


def tfidf_func(df: pd.DataFrame, column: str) -> pd.DataFrame:
    """Build a dense TF-IDF feature table from a text column.

    A ``TfidfVectorizer`` is fitted on ``df[column]`` using the notebook's
    ``preprocessing`` function, English stop words, at most 10000 features,
    ``min_df=5`` and ``max_df=0.85``.

    Args:
        df: Frame holding the documents.
        column: Name of the column in ``df`` that contains the raw text.

    Returns:
        A DataFrame with one row per document (default integer index, i.e.
        positional order of ``df[column]``) and one ``tfidf_<term>`` column
        per vocabulary term.
    """
    tfidf_vectorizer = TfidfVectorizer(stop_words='english',
                                       preprocessor=preprocessing,
                                       max_features=10000,
                                       min_df=5,
                                       max_df=0.85)
    tfidf_matrix = tfidf_vectorizer.fit_transform(df[column])

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in
    # 1.2; prefer its replacement and fall back only on older versions.
    if hasattr(tfidf_vectorizer, "get_feature_names_out"):
        feature_names = tfidf_vectorizer.get_feature_names_out()
    else:
        feature_names = tfidf_vectorizer.get_feature_names()

    prefix = "tfidf_"
    columns_with_prefix = [prefix + str(name) for name in feature_names]

    # The sparse matrix is densified here; memory grows with
    # n_documents * n_terms.
    tfidf_df = pd.DataFrame(tfidf_matrix.toarray(), columns=columns_with_prefix)

    return tfidf_df

In [9]:
# Vectorize the `text` column of the posts table.
tfidf_df = tfidf_func(df=post_text_df, column='text')



In [10]:
# Display the TF-IDF table (pandas truncates the rendering of large frames).
tfidf_df

Unnamed: 0,tfidf_00,tfidf_000,tfidf_000m,tfidf_007,tfidf_01,tfidf_03,tfidf_04,tfidf_05,tfidf_06,tfidf_07,...,tfidf_zeppelin,tfidf_zero,tfidf_zeta,tfidf_zhang,tfidf_zimbabwe,tfidf_zip,tfidf_zombie,tfidf_zone,tfidf_zoom,tfidf_zurich
0,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
1,0.0,0.144911,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
2,0.0,0.055949,0.0,0.0,0.0,0.058706,0.0,0.0,0.0,0.05528,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.056265
3,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
4,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7019,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7020,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000
7021,0.0,0.000000,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.00000,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000


In [11]:
def get_PCA(n_components: int, data: pd.DataFrame, random_state: int = 42):
    """Project ``data`` onto its first ``n_components`` principal components.

    Args:
        n_components: Number of principal components to keep.
        data: Numeric feature table, one row per sample.
        random_state: Seed forwarded to ``PCA``. For large inputs scikit-learn
            may select the randomized SVD solver, whose output varies between
            runs unless a seed is fixed.

    Returns:
        Array of shape ``(n_samples, n_components)`` with the transformed
        samples, in the row order of ``data``.
    """
    # PCA centers every feature itself before the decomposition, so an
    # explicit `data - data.mean()` would only create an extra full copy.
    pca = PCA(n_components=n_components, random_state=random_state)
    return pca.fit_transform(data)

In [12]:
# Reduce the TF-IDF table to 50 principal components.
pca_matrix = get_PCA(n_components=50, data=tfidf_df)

In [13]:
# One name per PCA component, numbered from 1. The count is taken from the
# matrix itself so it cannot drift from the n_components used to build it.
col_names = [f'feature_{i}' for i in range(1, pca_matrix.shape[1] + 1)]

In [14]:
# Wrap the PCA output in a DataFrame; rows get a default 0..n-1 index.
pca_df = pd.DataFrame(data=pca_matrix, columns=col_names)

In [15]:
# Display the PCA feature table (pandas truncates the rendering of large frames).
pca_df

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,feature_10,...,feature_41,feature_42,feature_43,feature_44,feature_45,feature_46,feature_47,feature_48,feature_49,feature_50
0,0.005147,0.194684,0.026514,-0.073550,-0.149519,-0.025246,0.048262,-0.171380,-0.133012,-0.096547,...,-0.011544,-0.023413,0.011791,-0.001552,0.017969,-0.044012,0.003677,-0.019558,-0.000491,0.003684
1,-0.000803,0.218085,0.067561,0.077333,-0.054571,-0.002832,0.005895,-0.026975,-0.014726,0.013766,...,0.017010,0.056739,-0.029615,0.021024,-0.043534,0.063448,0.003640,0.016381,0.009415,0.030641
2,-0.005729,0.163478,0.016925,-0.098532,-0.153892,-0.024022,0.039040,-0.133542,-0.084589,-0.096316,...,0.047376,-0.035591,-0.035618,0.047443,-0.021076,0.079652,-0.008630,-0.023912,-0.013965,-0.043979
3,0.010938,0.168339,0.025062,-0.063092,-0.153456,-0.016489,0.054549,-0.080640,-0.020966,-0.058244,...,0.037860,-0.093940,0.023425,-0.027543,-0.000297,-0.001030,0.021421,-0.000866,0.011163,0.009294
4,0.000350,0.122627,0.010034,-0.040647,-0.059208,-0.006183,-0.003240,-0.012149,0.010271,0.008270,...,-0.006245,-0.013095,0.021840,0.032503,0.018094,0.002465,0.000435,-0.014145,-0.012585,0.013169
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7018,-0.164127,-0.151603,0.168022,-0.013667,0.019653,-0.006836,0.012401,0.029391,-0.024342,-0.037239,...,0.035005,-0.060823,-0.004729,-0.048264,-0.036474,-0.016131,-0.038833,0.010313,-0.006126,0.065839
7019,-0.136358,-0.129865,0.106072,-0.004081,-0.055464,-0.004486,-0.006977,-0.005671,-0.015139,0.025607,...,0.019378,-0.012702,-0.009105,0.019364,-0.032426,-0.027241,-0.036319,0.026237,0.033488,0.017149
7020,-0.102380,-0.070634,-0.168156,0.079848,-0.034363,-0.001907,-0.012953,0.018627,-0.002254,-0.046368,...,0.049466,-0.022657,0.030182,0.026670,0.016062,0.001199,-0.032056,-0.075910,0.004267,-0.030648
7021,-0.112783,-0.035907,-0.101418,0.058777,0.020219,-0.005851,0.033735,0.004234,-0.036716,-0.005721,...,0.010099,0.023951,-0.043248,-0.009566,-0.057804,0.008819,-0.024724,-0.009627,-0.001237,-0.005306


In [16]:
# Keep every post column except the raw text, which the PCA features replace.
new_post_text_df = post_text_df.drop(columns=['text'])

In [17]:
# Attach the PCA features column-wise; rows are aligned on the index labels
# of the two frames.
new_post_text_df = pd.concat(objs=[new_post_text_df, pca_df], axis='columns')

In [18]:
# Persist the processed posts table without writing the index column.
new_post_text_df.to_csv(path_or_buf='new_post_text_df.csv', index=False)

In [19]:
# Running list of column names treated as categorical; later cells append to it.
# NOTE(review): presumably handed to the model as its categorical-feature list —
# the consumer is not visible in this part of the notebook.
category_features = ['topic']

### FEED_DATA PREPROCESSING. DATETIME

In [20]:
# Parse the timestamp column into datetime values so the `.dt` accessor can be used.
feed_data['timestamp'] = feed_data['timestamp'].pipe(pd.to_datetime)

In [21]:
# Discard the `action` column from the feed table.
feed_data = feed_data.drop(columns=['action'])

In [22]:
# Break the timestamp into calendar / clock components.
timestamp_parts = feed_data['timestamp'].dt
feed_data['month'] = timestamp_parts.month
feed_data['day'] = timestamp_parts.day
# NOTE(review): the second-of-minute is stored here while the hour is only
# used for `part_of_day` below — confirm `second` is the intended feature.
feed_data['second'] = timestamp_parts.second
feed_data['weekday'] = timestamp_parts.weekday
# pandas numbers weekdays Monday=0 ... Sunday=6, so 5 and 6 mark the weekend.
feed_data['is_weekend'] = feed_data['weekday'].isin([5, 6]).astype(int)

# Left-closed hour intervals: [0, 6), [6, 12), [12, 18), [18, 24).
bins = [0, 6, 12, 18, 24]
labels = ['Night', 'Morning', 'Afternoon', 'Evening']
feed_data['part_of_day'] = pd.cut(
    timestamp_parts.hour,
    bins=bins,
    labels=labels,
    right=False,
)

In [23]:
# Second name for the same DataFrame object — this is an alias, not a copy.
new_feed_data = feed_data

In [24]:
# Persist the processed feed table without writing the index column.
# NOTE(review): unlike the other exports in this notebook, this file name has
# no `.csv` extension — confirm readers of the file expect exactly this name.
new_feed_data.to_csv(path_or_buf='new_feed_data', index=False)

In [25]:
# Register the feed-derived categorical columns. The membership check makes
# the cell safe to re-run: names already present are not appended again.
for feature_name in ['month', 'day', 'weekday', 'part_of_day']:
    if feature_name not in category_features:
        category_features.append(feature_name)

print(category_features)

['topic', 'month', 'day', 'weekday', 'part_of_day']


### USER_DATA PREPROCESSING

In [26]:
# Left-closed age intervals: [0, 25) -> young, [25, 50) -> adult, [50, inf) -> old.
bins = [0, 25, 50, float('inf')]
labels = ['young', 'adult', 'old']
user_data['category_of_age'] = pd.cut(
    x=user_data['age'],
    bins=bins,
    labels=labels,
    right=False,
)

In [27]:
# Second name for the same DataFrame object — this is an alias, not a copy.
new_user_data = user_data

In [28]:
# Persist the processed user table without writing the index column.
new_user_data.to_csv(path_or_buf='new_user_data.csv', index=False)

In [29]:
# Register every user_data column as categorical except the ones listed here.
# NOTE(review): this also registers `user_id` as a categorical feature —
# confirm that an identifier column is meant to be used this way.
excluded_columns = ['age', 'gender', 'count_actions']
for column_name in user_data.columns:
    # The second check makes the cell safe to re-run: names already present
    # are not appended again.
    if column_name not in excluded_columns and column_name not in category_features:
        category_features.append(column_name)

print(category_features)

['topic', 'month', 'day', 'weekday', 'part_of_day', 'user_id', 'country', 'city', 'exp_group', 'os', 'source', 'category_of_age']


____