In [23]:
import pandas as pd

## Pre processing population dataset

In [24]:
pop = pd.read_csv('Datasets/raw/Population_raw.csv', sep=';')
pop_melt = pop.melt(id_vars='Arrondissement')
pop_melt.columns = ['Arrondissement', 'Année', 'Population']
pop_melt.to_csv('Datasets/processed/Population_Processed.csv', sep=';', index=False)

## Pre processing price dataset

In [25]:
data = pd.read_csv('Datasets/raw/Prices_raw.csv')
data.drop(columns="Unnamed: 0", inplace = True)
data.drop([46,47,48,49,85], inplace = True)

arrondissement = []
for i in range(0,20):
    arrondissementNum = '{:02}'.format(i+1)
    arrondissement.append((int('750' + str(arrondissementNum))))

data.columns = arrondissement+['Quarter','Year']

df1 = data.melt(id_vars=['Quarter', 'Year'])

df1.Quarter.replace(
                    to_replace=['T1', 'T2', 'T3', 'T4'],
                    value = ['Q1', 'Q2', 'Q3', 'Q4'],
                    inplace = True)

df1['Pays'] = 'France'

df1.columns = ['Quarter', 'Année', 'Arrondissement', 'Prix', 'Pays']

df1.to_csv('Datasets/processed/Prices_Processed.csv', index=False, sep=';')

## Creating Density dataset

In [26]:
population = pd.read_csv('Datasets/processed/Population_Processed.csv', sep = ';')
area = pd.read_csv('Datasets/processed/Area.csv', sep=';')

In [27]:
area['Superficie'] = area['Superficie'].apply(lambda x: x.replace(',','.'))
area['Superficie'] = pd.to_numeric(area['Superficie'])

In [28]:
Density = pd.merge(population, area, left_on=['Arrondissement'], right_on=['Arrondissement'], how='left')

In [29]:
row_to_drop = Density[(Density['Année'] == 2009) | (Density['Année'] == 2011)].index
Density['Density'] = Density['Population']/Density['Superficie']
Density['Country'] = 'France'
Density_evo = Density.drop(row_to_drop)
Density_evo.to_csv('Datasets/processed/Density_Processed.csv')

## Creating a Dataset that corrolate price and density

In [30]:
#Loading datas
prix = pd.read_csv('Datasets/processed/Prices_Processed.csv', sep=';')

#Merging DataFrame
density_population = pd.merge(prix, Density, left_on=['Arrondissement', 'Année'], right_on=['Arrondissement', 'Année'], how='left')
density_population = density_population[(density_population['Density'].notnull()) & (density_population['Quarter'] == 'Q1')]

density_population['Prix'] = density_population['Prix'].str.replace(' ', '')
density_population['Prix'] = pd.to_numeric(density_population['Prix'])

prix_density_corr = pd.DataFrame()

# Creating an array of the district numbers
arrondissementNumber = []
for i in range(0,20):
    arrondissementNum = '{:02}'.format(i+1)
    arrondissementNumber.append((int('750' + str(arrondissementNum))))

#Comparing the max density and max price year
for arrondissementNum in arrondissementNumber:
    byDistrict = density_population[density_population['Arrondissement'] == arrondissementNum]
    byDistrict['Density'] = byDistrict['Density'].astype(int)
    
    min_price = byDistrict[byDistrict['Prix'] == byDistrict['Prix'].min()] 
    max_density = byDistrict[byDistrict['Density'] == byDistrict['Density'].max()]
    
    if(min_price.equals(max_density)):
        new_row = {'Arrondissement'  : [arrondissementNum] , 'is_corr' : [True]}
        prix_density_corr = pd.concat([prix_density_corr, pd.DataFrame(new_row)])
    else:
        new_row = {'Arrondissement'  : [arrondissementNum] , 'is_corr' : [False]}
        prix_density_corr = pd.concat([prix_density_corr, pd.DataFrame(new_row)])
        

prix_density_corr['Country'] = 'France'
prix_density_corr.to_csv('Datasets/processed/Correlation.csv')
prix_density_corr.head(5)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,Arrondissement,is_corr,Country
0,75001,True,France
0,75002,False,France
0,75003,False,France
0,75004,True,France
0,75005,True,France


## Creating one dataset for "big four" district and one for others

In [31]:
corr = prix_density_corr[prix_density_corr['is_corr'] == True]['Arrondissement'].values

#Constructing ywo arrays : one with the zipcode of the big four and one with the others
corr = prix_density_corr[prix_density_corr['is_corr'] == True]['Arrondissement'].values
not_big_four_num = []
for arrondissementNum in arrondissementNumber:
    if arrondissementNum not in corr:
        not_big_four_num.append(arrondissementNum)    

#Constructiong our two dataframes
big_four = pd.DataFrame()
not_big_four = pd.DataFrame()

for district in corr:
    district = density_population[density_population['Arrondissement'] == district]
    density_2006 = district[district['Année'] == 2006]
    density_2017 = district[district['Année'] == 2017]
    big_four = pd.concat([big_four, density_2006])
    big_four = pd.concat([big_four, density_2017])
    

for district in not_big_four_num:
    district = density_population[density_population['Arrondissement'] == district]
    density_2006 = district[district['Année'] == 2006]
    density_2017 = district[district['Année'] == 2017]
    not_big_four = pd.concat([not_big_four, density_2006])
    not_big_four = pd.concat([not_big_four, density_2017])
    

## Creating evolution features

In [32]:
#big four evolution
big_four_density_2006 = big_four[big_four['Année'] == 2006]['Density'].mean()
big_four_density_2017 = big_four[big_four['Année'] == 2017]['Density'].mean()
not_big_four_density_2006 = not_big_four[not_big_four['Année'] == 2006]['Density'].mean()
not_big_four_density_2017 = not_big_four[not_big_four['Année'] == 2017]['Density'].mean()

big_four_density_evo = (big_four_density_2006 - big_four_density_2017)/big_four_density_2006 *100
not_big_four_density_evo = (not_big_four_density_2006 - not_big_four_density_2017)/not_big_four_density_2006*100

#not_big_four_evolution
big_four_price_2006 = big_four[big_four['Année'] == 2006]['Prix'].mean()
big_four_price_2017 = big_four[big_four['Année'] == 2017]['Prix'].mean()
not_big_four_price_2006 = not_big_four[not_big_four['Année'] == 2006]['Prix'].mean()
not_big_four_price_2017 = not_big_four[not_big_four['Année'] == 2017]['Prix'].mean()

big_four_price_evo = (big_four_price_2017 - big_four_price_2006)/big_four_price_2006*100
not_big_four_price_evo = (not_big_four_price_2017 - not_big_four_price_2006)/not_big_four_price_2006*100

In [33]:
viz = pd.DataFrame({'District' : ['Big Four', 'Not Big Four'], 
                    'Price Growth' : [big_four_price_evo, not_big_four_price_evo], 
                    'Density Loss' : [big_four_density_evo, not_big_four_density_evo]})

viz.set_index('District')

Unnamed: 0_level_0,Price Growth,Density Loss
District,Unnamed: 1_level_1,Unnamed: 2_level_1
Big Four,60.23085,4.755351
Not Big Four,57.725714,-0.629248


In [34]:
big_four[big_four['Année'] == 2017]['Prix'].mean()

11452.5