In [None]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats

import warnings

warnings.filterwarnings('ignore')

# Load the per-customer exports. Both CSV files are produced by running the
# queries in the user_attributes.query file against prod.
df = pd.read_csv('user_att.csv')
df2 = pd.read_csv('freq_btwn_rentals.csv')

# Merge the two data frames. The outer join keeps customers that appear in only
# one of the exports; their missing attributes are NaN until the zero-fill below.
user_attributes = pd.merge(df, df2, on='customer_id', how='outer')

# Rename column for better clarity.
user_attributes = user_attributes.rename(columns={'date_part': 'avg days between rentals'})

# Convert seconds values to number of days.
user_attributes['Time from last rental '] = user_attributes['Time from last rental '] / 60 / 60 / 24
user_attributes['days from registration'] = user_attributes['days from registration'] / 60 / 60 / 24
user_attributes['avg days between rentals'] = user_attributes['avg days between rentals'] / 60 / 60 / 24

# Derived attributes.
# loyalty: days between registration and the most recent rental.
user_attributes['loyalty'] = (
    user_attributes['days from registration'] - user_attributes['Time from last rental ']
)
# fare per rental: a zero rental_count makes the division produce +/-inf, which
# fillna() does not touch and astype(int) rejects. Turn those into NaN so they
# are zero-filled together with the other missing values.
user_attributes['fare per rental'] = (
    user_attributes['fare'] / user_attributes['rental_count']
).replace([np.inf, -np.inf], np.nan)

# Update all missing values to 0.
user_attributes = user_attributes.fillna(0)

# Cast values as integers instead of floats (fractional parts are truncated).
user_attributes_no_float = user_attributes.astype(int)

# Remove outliers: keep a row only when the absolute z-score of every column
# is below the threshold.
# NOTE(review): the threshold is the number of columns in user_attributes, not
# a conventional cut-off such as 3, and customer_id is included in the z-score
# computation - confirm both are intended.
zscore_threshold = len(user_attributes.columns)
column_zscores = np.asarray(stats.zscore(user_attributes_no_float), dtype=float)
# A constant column has zero standard deviation, so its z-scores are NaN and
# "NaN < threshold" is False, which would discard every row. A column with no
# spread cannot contain outliers, so treat its NaN scores as within range.
within_threshold = np.isnan(column_zscores) | (np.abs(column_zscores) < zscore_threshold)
# Explicit copy: the column assignments that follow then write to an
# independent frame rather than to the result of a filter.
user_attributes_no_outlier = user_attributes_no_float[within_threshold.all(axis=1)].copy()

# Rank the attributes by quartile (cut points at the 25%, 50% and 75% quantiles).

# Labels for the attributes ranked high-to-low, keyed by the number of quantile
# bins that remain after duplicates='drop' merges repeated cut points. The
# 3-bin entry is the labelling this analysis has used so far; the other entries
# also give the lowest bin the label 4.
# NOTE(review): confirm the 1-, 2- and 4-bin labellings with the analysis owner.
_DESCENDING_RANK_LABELS = {1: [4], 2: [4, 1], 3: [4, 3, 1], 4: [4, 3, 2, 1]}


def _descending_quartile_rank(values):
    """Rank ``values`` by quartile, giving the lowest bin the label 4.

    With duplicates='drop' the number of bins depends on the data, and
    pd.qcut requires exactly one label per bin. The bin count is therefore
    measured first and the matching label list is chosen from
    ``_DESCENDING_RANK_LABELS``.

    Raises:
        ValueError: if no usable quantile bin remains (for example when every
            value is identical).
    """
    _, bin_edges = pd.qcut(values, 4, labels=False, retbins=True, duplicates='drop')
    bin_count = len(bin_edges) - 1
    if bin_count not in _DESCENDING_RANK_LABELS:
        raise ValueError(
            'cannot rank %r: %d quantile bins after dropping duplicates'
            % (getattr(values, 'name', None), bin_count)
        )
    return pd.qcut(values, 4, labels=_DESCENDING_RANK_LABELS[bin_count], duplicates='drop')


# Higher value -> higher rank.
user_attributes_no_outlier['rental_count_rank'] = pd.qcut(
    user_attributes_no_outlier.rental_count, 4, labels=[1, 2, 3, 4])
user_attributes_no_outlier['fare_rank'] = pd.qcut(
    user_attributes_no_outlier.fare, 4, labels=[1, 2, 3, 4])
# The lowest quartile of time since the last rental gets 6, the highest gets -3.
user_attributes_no_outlier['time_from_last_rental_rank'] = pd.qcut(
    user_attributes_no_outlier['Time from last rental '], 4, labels=[6, 3, 0, -3])
user_attributes_no_outlier['days from registration_rank'] = pd.qcut(
    user_attributes_no_outlier['days from registration'], 4, labels=[1, 2, 3, 4])

# Lower value -> higher rank; these columns may have repeated cut points.
user_attributes_no_outlier['credit_amount_rank'] = _descending_quartile_rank(
    user_attributes_no_outlier['credit amount'])
user_attributes_no_outlier['total_credits_used_rank'] = _descending_quartile_rank(
    user_attributes_no_outlier['total credits used'])
user_attributes_no_outlier['avg_time_between_rentals_rank'] = _descending_quartile_rank(
    user_attributes_no_outlier['avg days between rentals'])

user_attributes_no_outlier['fare_per_rental_rank'] = pd.qcut(
    user_attributes_no_outlier['fare per rental'], 4, labels=[1, 2, 3, 4])
user_attributes_no_outlier['loyalty_rank'] = pd.qcut(
    user_attributes_no_outlier['loyalty'], 4, labels=[1, 2, 3, 4])

# Average the monetary_score, which is calculated from multiple rank columns.
# Rank columns produced by pd.qcut with labels are categorical; cast them to
# float so the row-wise mean is a plain numeric reduction.
monetary_rank_columns = [
    'fare_rank',
    'credit_amount_rank',
    'total_credits_used_rank',
    'fare_per_rental_rank',
]
user_attributes_no_outlier['monetary_score'] = (
    user_attributes_no_outlier[monetary_rank_columns].astype(float).mean(axis=1)
)

# Average all ranks into a customer rank.
customer_rank_columns = [
    'time_from_last_rental_rank',
    'avg_time_between_rentals_rank',
    'loyalty_rank',
    'monetary_score',
]
user_attributes_no_outlier['customer_rank'] = (
    user_attributes_no_outlier[customer_rank_columns].astype(float).mean(axis=1)
)

# Set the index to be the customer_id.
user_attributes_no_outlier = user_attributes_no_outlier.set_index('customer_id')

# Write the scores to disk. customer_id now lives in the index, so the index
# must be written; with index=False the file would contain scores with no
# customer identifier.
user_attributes_no_outlier.to_csv('./customer_score.csv', index=True)
user_attributes_no_outlier