## Imports

In [1]:
import json
import pandas as pd
import numpy as np
import joblib
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

## Open our data

In [2]:
df_data = []
with open('kfxm7vdg5ezrxirbnfen5uiuze.json', encoding='utf-8') as jsonFile:
    for line in jsonFile.read().splitlines():
        data = json.loads(line)['Item']
        link = data['link']['S']
        appearances = int(data['appearances']['N'])
        features = []
        if data.get('features'):
            for feature in sorted(data['features']['M']):
                value = int(data['features']['M'][feature]['N'])
                features.append(value)
            df_data.append([link, appearances] + features)

columns = ['link', 'appearances'] + list(sorted(data['features']['M']))

In [3]:
df = pd.DataFrame(df_data, columns = columns)

In [4]:
df.head()

Unnamed: 0,link,appearances,feature1,feature10,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9
0,https://accounts.google.com/signin/usernamerec...,1,144,3,4,37,0,0,113,7,57,57
1,https://support.google.com/accounts/answer/173...,3,56,3,5,11,0,0,39,7,33,33
2,https://support.google.com/youtube/answer/2802027,1,49,3,5,12,0,0,34,7,26,26
3,https://maps.google.com.br/maps?hl=pt-BR&tab=8l,16,47,3,3,7,0,1,34,1,23,23
4,https://www.gstatic.com/policies/terms/pdf/202...,4,97,3,8,18,0,0,67,12,3,73


In [11]:
df[df.link == 'https://www.google.com']

Unnamed: 0,link,appearances,feature1,feature10,feature2,feature3,feature4,feature5,feature6,feature7,feature8,feature9
182,https://www.google.com,1,22,3,2,4,0,0,17,0,3,11


## Getting X and y and split in train and test sets

In [5]:
X = df.drop(columns = ['link', 'appearances'])
y = df['appearances']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3)

## Training and evaluation

In [7]:
rf = RandomForestRegressor(n_estimators=100, max_features=10, max_depth=10)

In [8]:
rf.fit(X_train, y_train)

RandomForestRegressor(max_depth=10, max_features=10)

In [9]:
y_pred = rf.predict(X_test).astype(int)
np.sqrt(mean_squared_error(y_pred, y_test))

13.259901960421878

## Model saving

In [10]:
filename = 'model_appearance.joblib'
joblib.dump(rf, filename)

['model_appearance.joblib']