import matplotlib.mlab as mlab
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from scipy.stats import norm
# Load the full Steam dataset from the local CSV file.
# NOTE(review): `df` is not referenced again in the visible part of the
# notebook (the analysis below re-reads the file into `dfp`) — confirm it is
# still needed by later sections.
df=pd.read_csv("./data/steam.csv")

Problématique : Le prix d'un jeu vidéo a-t-il une influence significative et systématique sur les ventes de ce dernier ?

## Partie 1

In [39]:
# Read only the columns needed for the price study, then preview the first
# rows to confirm that the CSV loads correctly.
dfp = pd.read_csv(
    "./data/steam.csv",
    usecols=['appid', 'name', 'release_date', 'owners', 'price'],
)
dfp.head()


Unnamed: 0,appid,name,release_date,owners,price
0,10,Counter-Strike,2000-11-01,10000000-20000000,7.19
1,20,Team Fortress Classic,1999-04-01,5000000-10000000,3.99
2,30,Day of Defeat,2003-05-01,5000000-10000000,3.99
3,40,Deathmatch Classic,2001-06-01,5000000-10000000,3.99
4,50,Half-Life: Opposing Force,1999-11-01,5000000-10000000,3.99


In [40]:
# Summary statistics of the numeric columns, used to look for aberrant values.
dfp.describe()


Unnamed: 0,appid,price
count,27075.0,27075.0
mean,596203.5,6.078193
std,250894.2,7.874922
min,10.0,0.0
25%,401230.0,1.69
50%,599070.0,3.99
75%,798760.0,7.19
max,1069460.0,421.99


In [41]:
# The maximum price is around 400 $, far above the mean game price, so a box
# plot of `price` is drawn to see how many outliers there are.
fig, ax = plt.subplots(figsize=(10, 5))
dfp.boxplot(column='price', ax=ax)
ax.set_title("prix des jeux ")

plt.show()

<IPython.core.display.Javascript object>

In [42]:
# A few games are priced far above the bulk of the catalogue; they are treated
# as outliers and removed before the rest of the analysis.
# NOTE(review): the original note mentioned 80 $ as the standard maximum price
# while the filter has always used 60; the value from the code is kept here —
# confirm which threshold was intended.
PRICE_CUTOFF = 60  # rows with price >= PRICE_CUTOFF are discarded

# Rebind a filtered copy instead of dropping in place: the cell stays
# idempotent and later column assignments do not act on a view.
# `~(price >= cutoff)` (rather than `price < cutoff`) keeps exactly the rows
# the original drop kept, including any row with a missing price.
# The original index labels are preserved (no reset).
dfp = dfp[~(dfp['price'] >= PRICE_CUTOFF)].copy()

# Redraw the price box plot on the cleaned data.
fig, ax = plt.subplots(figsize=(10, 5))
dfp.boxplot(column='price', ax=ax)
ax.set_title("prix des jeux ")

plt.show()

<IPython.core.display.Javascript object>

In [43]:

# Split the "min-max" owners range string into two integer columns holding
# the lower and upper bound.
# `n` must be passed by keyword: since pandas 2.0 every argument of
# Series.str.split after `pat` is keyword-only, so the positional form
# `split('-', 1, expand=True)` raises TypeError.
# NOTE: this cell reads `owners` as strings, so it cannot be re-run once a
# later cell has overwritten `owners` with numbers.
dfp[['min_o', 'max_o']] = (
    dfp['owners'].str.split('-', n=1, expand=True).astype(int)
)

dfp.head(5)

Unnamed: 0,appid,name,release_date,owners,price,min_o,max_o
0,10,Counter-Strike,2000-11-01,10000000-20000000,7.19,10000000,20000000
1,20,Team Fortress Classic,1999-04-01,5000000-10000000,3.99,5000000,10000000
2,30,Day of Defeat,2003-05-01,5000000-10000000,3.99,5000000,10000000
3,40,Deathmatch Classic,2001-06-01,5000000-10000000,3.99,5000000,10000000
4,50,Half-Life: Opposing Force,1999-11-01,5000000-10000000,3.99,5000000,10000000


In [44]:
# Overwrite the raw range string in `owners` with upper bound minus lower bound.
# NOTE(review): this is the width of the owners interval, not an estimate of
# the owner count itself — confirm this is the intended sales proxy.
dfp['owners'] = dfp['max_o'].sub(dfp['min_o'])
dfp.head()

Unnamed: 0,appid,name,release_date,owners,price,min_o,max_o
0,10,Counter-Strike,2000-11-01,10000000,7.19,10000000,20000000
1,20,Team Fortress Classic,1999-04-01,5000000,3.99,5000000,10000000
2,30,Day of Defeat,2003-05-01,5000000,3.99,5000000,10000000
3,40,Deathmatch Classic,2001-06-01,5000000,3.99,5000000,10000000
4,50,Half-Life: Opposing Force,1999-11-01,5000000,3.99,5000000,10000000


In [45]:
# Select matplotlib's interactive "notebook" backend.
# NOTE(review): earlier cells already draw figures; on a fresh kernel this
# magic only takes effect from this point on — consider moving it next to the
# imports at the top of the notebook.
%matplotlib notebook

In [46]:
# Compare the empirical price distribution with a normal density that has the
# same mean and (sample, ddof=1) standard deviation.
std = dfp['price'].std(ddof=1)
mean = dfp['price'].mean()
# np.linspace defaults to 50 evenly spaced points over the observed range.
domain = np.linspace(dfp['price'].min(), dfp['price'].max())

# Create a dedicated figure: every other plotting cell does so, and without
# it the interactive backend may draw into the previous cell's figure.
fig, ax = plt.subplots()
ax.plot(domain, norm.pdf(domain, mean, std))
# density=True normalises the histogram so it is comparable to the pdf curve.
ax.hist(dfp['price'], edgecolor='black', bins=60, alpha=0.5, density=True)
ax.set_xlabel('prix des jeux')
ax.set_ylabel('densité')
plt.show()

<IPython.core.display.Javascript object>

In [47]:
# Box plot of the `owners` column to inspect its spread and outliers.
fig, ax = plt.subplots(figsize=(10, 5))
dfp.boxplot(column='owners', ax=ax)
ax.set_title("nb joueurs ")

plt.show()

<IPython.core.display.Javascript object>

In [48]:
# Check whether the release date column already has a date type.
dfp['release_date'].dtype

dtype('O')

In [49]:
# The column is stored as generic objects, so convert it to datetime values
# and print the result to check the new dtype.
release_dates = pd.to_datetime(dfp['release_date'])
dfp['release_date'] = release_dates
print(dfp['release_date'])

0       2000-11-01
1       1999-04-01
2       2003-05-01
3       2001-06-01
4       1999-11-01
           ...    
27070   2019-04-24
27071   2019-04-23
27072   2019-04-24
27073   2019-04-17
27074   2019-04-24
Name: release_date, Length: 27045, dtype: datetime64[ns]


In [50]:
# Keep only the price and owner-derived columns and show their pairwise
# correlation matrix (DataFrame.corr defaults to Pearson).
# The seaborn import that used to live in this cell belongs with the other
# imports in the notebook's first cell.
dataStorecorr = dfp[['owners', 'price', 'max_o', 'min_o']]
dataStorecorr.corr()

Unnamed: 0,owners,price,max_o,min_o
owners,1.0,0.041029,0.997674,0.990174
price,0.041029,1.0,0.040006,0.038724
max_o,0.997674,0.040006,1.0,0.997403
min_o,0.990174,0.038724,0.997403,1.0


In [51]:
# Annotated heat map of the correlation matrix of `dataStorecorr`.
fig, ax = plt.subplots(figsize=(8, 8))
sns.heatmap(dataStorecorr.corr(), annot=True, cmap="coolwarm", ax=ax)

<IPython.core.display.Javascript object>

<matplotlib.axes._subplots.AxesSubplot at 0x2464d718>

## Partie 2