In [1]:
import pickle
import pandas as pd
import collections
import os

In [2]:
def early_bird(lst_time):
    # 5 am to 8 am
    res = 0
    for time in lst_time:
        if 5 <= time.hour <= 8:
            res += 1
    return (res / len(lst_time)) >= 0.5

In [3]:
def night_owl(lst_time):
    # 12 am to 3 am
    res = 0
    for time in lst_time:
        if 0 <= time.hour <= 3:
            res += 1
    return (res / len(lst_time)) >= 0.5

In [4]:
def prefered_hour(lst_time):
    return collections.Counter([time.hour for time in lst_time]).most_common(1)[0][0]

In [5]:
def weekend_tweeter(lst_time):
    # saturday and sunday as weekend (5 and 6)
    res = 0
    for time in lst_time:
        if 5 <= time.weekday() <= 6:
            res += 1
    return (res / len(lst_time)) >= 0.5

In [6]:
def week_tweeter(lst_time):
    # monday to friday
    res = 0
    for time in lst_time:
        if 0 <= time.weekday() <= 4:
            res += 1
    return (res / len(lst_time)) >= 0.5

In [7]:
def prefered_weekday(lst_time):
    return collections.Counter([time.weekday() for time in lst_time]).most_common(1)[0][0]

In [8]:
def extract_features(person):
    res = {}
    res['creation_year'] = person[1]['created_at'].year
    res['early_bird'] = early_bird(person[0])
    res['night_owl'] = night_owl(person[0])
    res['prefered_hour'] = prefered_hour(person[0])
    res['weekend_tweeter'] = weekend_tweeter(person[0])
    res['week_tweeter'] = week_tweeter(person[0])
    res['prefered_weekday'] = prefered_weekday(person[0])
    res['friends_count'] = person[1]['friends_count']
    res['followers_count'] = person[1]['followers_count']
    res['favourites_count'] = person[1]['favourites_count']
    res['geo_enabled'] = person[1]['geo_enabled']
    res['tweets_count'] = person[1]['statuses_count']
    
    return res

In [9]:
df = pd.DataFrame(columns=['id', 'creation_year', 'early_bird', 'night_owl', 'prefered_hour', 'weekend_tweeter', 'week_tweeter', 'prefered_weekday', 
                          'friends_count', 'followers_count', 'favourites_count', 'geo_enabled', 'tweets_count'])

In [10]:
for file in os.listdir('files/'):
    with open(f'files/{file}', 'rb') as f:
        raw_data = pickle.load(f)
    for k in raw_data.keys():
        row = extract_features(raw_data[k])
        row['id'] = k
        df = df.append(row, ignore_index=True)

In [11]:
df

Unnamed: 0,id,creation_year,early_bird,night_owl,prefered_hour,weekend_tweeter,week_tweeter,prefered_weekday,friends_count,followers_count,favourites_count,geo_enabled,tweets_count
0,35252,2008,False,False,18,False,True,6,87,124,21,True,575
1,35253,2014,False,False,21,False,True,6,67,522,314,True,1089
2,35257,2011,False,False,20,False,True,6,399,72,194,True,718
3,35258,2010,False,False,6,False,True,0,302,17,705,True,1162
4,35259,2010,False,False,4,False,True,3,121,119,9,True,3251
...,...,...,...,...,...,...,...,...,...,...,...,...,...
35495,19903,2010,False,False,20,False,True,5,1979,2455,175242,True,159204
35496,19907,2018,False,False,2,False,True,1,160,48,5820,False,1432
35497,19908,2015,False,False,17,False,True,6,1257,171,1203,True,1965
35498,19913,2013,False,False,18,False,True,2,112,22,1069,True,3560


In [16]:
ratings = pd.read_csv('ratings.dat', delimiter='::', names=['user_id', 'film_id', 'rate', 'timestamp'])

  ratings = pd.read_csv('ratings.dat', delimiter='::', names=['user_id', 'film_id', 'rate', 'timestamp'])


In [17]:
ratings

Unnamed: 0,user_id,film_id,rate,timestamp
0,1,114508,8,1381006850
1,2,499549,9,1376753198
2,2,1305591,8,1376742507
3,2,1428538,1,1371307089
4,3,75314,1,1595468524
...,...,...,...,...
906178,70732,9893250,10,1613857551
906179,70732,9898858,3,1585958452
906180,70733,172495,10,1587107015
906181,70733,414387,10,1587107852


In [26]:
df_final = (pd.merge(df, ratings, left_on='id', right_on='user_id', how='inner')
            .filter(items=df.columns.to_list() + ['film_id', 'rate'])
           )

In [27]:
df_final

Unnamed: 0,id,creation_year,early_bird,night_owl,prefered_hour,weekend_tweeter,week_tweeter,prefered_weekday,friends_count,followers_count,favourites_count,geo_enabled,tweets_count,film_id,rate
0,35252,2008,False,False,18,False,True,6,87,124,21,True,575,1374992,5
1,35253,2014,False,False,21,False,True,6,67,522,314,True,1089,2980516,7
2,35257,2011,False,False,20,False,True,6,399,72,194,True,718,460740,6
3,35257,2011,False,False,20,False,True,6,399,72,194,True,718,1764651,6
4,35257,2011,False,False,20,False,True,6,399,72,194,True,718,2101441,6
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
486255,19908,2015,False,False,17,False,True,6,1257,171,1203,True,1965,7131622,9
486256,19908,2015,False,False,17,False,True,6,1257,171,1203,True,1965,7286456,10
486257,19913,2013,False,False,18,False,True,2,112,22,1069,True,3560,12361974,10
486258,19913,2013,False,False,18,False,True,2,112,22,1069,True,3560,5034838,7


In [28]:
df_final.to_csv('dataset.csv')