### Import statements

In [1]:
#import statements
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime
import plotly
from plotly.graph_objs import *
import plotly.offline as plot
import plotly.graph_objs as go
# Enable offline Plotly rendering inside the notebook. `plot` is already the
# `plotly.offline` package (see the import above), so call its public
# `init_notebook_mode` directly instead of reaching through the internal
# `plotly.offline.offline` submodule.
plot.init_notebook_mode(connected=True)

In [2]:
# Load the user table ('u.user', pipe-delimited) and the ratings table
# ('u.data', whitespace-delimited) into pandas. Neither file has a header row,
# so the column names are supplied at read time.
data_user = pd.read_table(
    'u.user',
    sep='|',
    header=None,
    names=['user_id', 'age', 'gender', 'occupation', 'zip code'],
)
# sep=r'\s+' is the supported replacement for the deprecated
# delim_whitespace=True and splits on any run of whitespace.
data_udata = pd.read_table(
    'u.data',
    sep=r'\s+',
    header=None,
    names=['user_id', 'item id', 'rating', 'timestamp'],
)
print(data_user)

     user_id  age gender     occupation zip code
0          1   24      M     technician    85711
1          2   53      F          other    94043
2          3   23      M         writer    32067
3          4   24      M     technician    43537
4          5   33      F          other    15213
5          6   42      M      executive    98101
6          7   57      M  administrator    91344
7          8   36      M  administrator    05201
8          9   29      M        student    01002
9         10   53      M         lawyer    90703
10        11   39      F          other    30329
11        12   28      F          other    06405
12        13   47      M       educator    29206
13        14   45      M      scientist    55106
14        15   49      F       educator    97301
15        16   21      M  entertainment    10309
16        17   30      M     programmer    06355
17        18   35      F          other    37212
18        19   40      M      librarian    02138
19        20   42   

In [3]:
# Print the ratings frame (user_id, item id, rating, timestamp) for a visual check.
# NOTE(review): this prints the whole frame; a `.head()` preview would be lighter.
print(data_udata)

       user_id  item id  rating  timestamp
0          196      242       3  881250949
1          186      302       3  891717742
2           22      377       1  878887116
3          244       51       2  880606923
4          166      346       1  886397596
5          298      474       4  884182806
6          115      265       2  881171488
7          253      465       5  891628467
8          305      451       3  886324817
9            6       86       3  883603013
10          62      257       2  879372434
11         286     1014       5  879781125
12         200      222       5  876042340
13         210       40       3  891035994
14         224       29       3  888104457
15         303      785       3  879485318
16         122      387       5  879270459
17         194      274       2  879539794
18         291     1042       4  874834944
19         234     1184       2  892079237
20         119      392       4  886176814
21         167      486       4  892738452
22         

### Distribution of user ages

In [4]:
# Histogram of user ages, rendered with Plotly's offline notebook renderer.
data = [go.Histogram(x=data_user['age'])]
layout = {
    'xaxis': {'title': 'Age'},
    'yaxis': {'title': 'frequency'},
}
fig = {'data': data, 'layout': layout}
plot.iplot(fig, filename='Age Distribution')

### Rating Histogram

In [5]:
# Histogram of the rating values across all rating records.
data = [go.Histogram(x=data_udata['rating'])]
layout = {
    'xaxis': {'title': 'rating'},
    'yaxis': {'title': 'frequency'},
}
fig = {'data': data, 'layout': layout}
plot.iplot(fig, filename='rating histogram')

### Average rating per day over time

In [6]:
# Build a dated copy of the ratings: `assign` returns a new frame, so
# `data_udata` keeps its raw integer timestamps. The keyword arguments are
# evaluated in order, so `date` is derived from the already-converted
# `timestamp` column.
u_data = data_udata.assign(
    timestamp=lambda frame: pd.to_datetime(frame['timestamp'], unit='s'),
    date=lambda frame: frame['timestamp'].dt.date,
)
u_data.head()

Unnamed: 0,user_id,item id,rating,timestamp,date
0,196,242,3,1997-12-04 15:55:49,1997-12-04
1,186,302,3,1998-04-04 19:22:22,1998-04-04
2,22,377,1,1997-11-07 07:18:36,1997-11-07
3,244,51,2,1997-11-27 05:02:03,1997-11-27
4,166,346,1,1998-02-02 05:33:16,1998-02-02


In [7]:
# Mean rating per calendar date, kept as a one-column ('rating') DataFrame
# indexed by date.
average_date_rating = u_data.groupby(['date'])['rating'].mean().to_frame()


In [8]:
# Line chart of the mean rating for each calendar date.
trace0 = go.Scatter(
    x=average_date_rating.index,
    y=average_date_rating['rating'],
)
data = [trace0]
layout = {
    'xaxis': {'title': 'date'},
    'yaxis': {'title': 'average rating'},
}
fig = {'data': data, 'layout': layout}
plot.iplot(fig, filename='average rating over the years')

### Average rating by gender across age groups

In [9]:
# Attach each user's demographics to their ratings (outer join on user_id),
# then drop the zip code column.
df_merged_user_data = (
    data_user
    .merge(data_udata, on='user_id', how='outer')
    .drop(columns=['zip code'])
)
df_merged_user_data

Unnamed: 0,user_id,age,gender,occupation,item id,rating,timestamp
0,1,24,M,technician,61,4,878542420
1,1,24,M,technician,189,3,888732928
2,1,24,M,technician,33,4,878542699
3,1,24,M,technician,160,4,875072547
4,1,24,M,technician,20,4,887431883
5,1,24,M,technician,202,5,875072442
6,1,24,M,technician,171,5,889751711
7,1,24,M,technician,265,4,878542441
8,1,24,M,technician,155,2,878542201
9,1,24,M,technician,117,3,874965739


In [10]:
# Mean rating for every (age, gender) pair, pivoted so each gender becomes a
# column and `age` is an ordinary column again.
df_age_rating = (
    df_merged_user_data
    .groupby(['age', 'gender'])['rating']
    .mean()
    .unstack()
    .reset_index()
)
df_age_rating

gender,age,F,M
0,7,,3.767442
1,10,,3.387097
2,11,,2.925926
3,13,3.144928,3.523364
4,14,3.375000,
5,15,3.415094,2.962121
6,16,3.690476,3.211155
7,17,3.471698,3.522124
8,18,3.805375,3.593860
9,19,3.314574,3.422191


In [11]:
# Decade-wide bin edges 0, 10, ..., 100 and one label per adjacent pair of edges.
bins = list(range(0, 101, 10))
# The first label starts at the edge itself ('0-10'); every later label starts
# one above its lower edge ('11-20', '21-30', ...).
group_names = [
    '{}-{}'.format(lower + 1 if lower else lower, upper)
    for lower, upper in zip(bins[:-1], bins[1:])
]

In [12]:
# Label every age row with its decade bucket from `bins` / `group_names`.
df_age_rating['categories'] = pd.cut(
    x=df_age_rating['age'],
    bins=bins,
    labels=group_names,
)

In [13]:
# Show the column labels of df_age_rating as a sanity check.
df_age_rating.columns

Index(['age', 'F', 'M', 'categories'], dtype='object', name='gender')

In [51]:
# Average the per-age mean ratings within each age bucket.
# `categories` is categorical, so `observed=False` is stated explicitly: it
# keeps buckets with no rows (e.g. '81-90') as NaN rows and avoids depending on
# pandas' changing default for categorical groupers.
# NOTE(review): this is an unweighted mean of per-age means, so an age with few
# ratings counts as much as one with many. Confirm that is the intended metric.
df_age_rating_new = df_age_rating.groupby('categories', observed=False).mean()
df_age_rating_new

gender,age,F,M
categories,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0-10,8.5,,3.577269
11-20,15.888889,3.497686,3.349423
21-30,25.5,3.432614,3.447094
31-40,35.5,3.664806,3.523024
41-50,45.5,3.540161,3.632427
51-60,55.5,3.719219,3.715564
61-70,65.333333,3.250153,3.572752
71-80,73.0,,3.982143
81-90,,,
91-100,,,


In [52]:
# Side-by-side bars of the mean rating per age bucket, one series per gender.
trace0 = go.Bar(
    x=df_age_rating_new.index,
    y=df_age_rating_new['F'],
    name='Female',
)
trace1 = go.Bar(
    x=df_age_rating_new.index,
    y=df_age_rating_new['M'],
    name='Male',
)
data = [trace0, trace1]
layout = dict(
    xaxis=dict(title='Age Range'),
    yaxis=dict(title='average rating'),
    # 'group' places the bars next to each other. Stacking would add the two
    # averages together, which is not a meaningful quantity on a rating scale.
    barmode='group',
)
fig = dict(data=data, layout=layout)
# Pass the name as `filename`, as the other plotting cells do; the second
# positional parameter of `iplot` is `show_link`, not a name.
plot.iplot(fig, filename='Gender ratings Age analysis')


### Occupation vs avg rating

In [54]:
# Mean rating per occupation, kept as a one-column ('rating') DataFrame
# indexed by occupation.
occupation_rating = (
    df_merged_user_data
    .groupby(['occupation'])['rating']
    .mean()
    .to_frame()
)

# Bar chart with one bar per occupation.
trace0 = go.Bar(
    x=occupation_rating.index,
    y=occupation_rating['rating'],
)
data = [trace0]
layout = dict(
    # The x axis holds occupations; the original label said 'Age'.
    xaxis=dict(title='Occupation'),
    yaxis=dict(title='Average Rating'),
    barmode='group',
)
fig = dict(data=data, layout=layout)
plot.iplot(fig, filename='Average rating vs occupation')