
# 小地域の収入推定

```Data```フォルダには、小地域の収入データ(```income_district.csv```)と国勢調査の小地域集計データが格納されています。

国勢調査の小地域集計データ（人口構成、労働力構成、住宅形態など）から、その地域の収入を推定することが考えられます。推定のための機械学習・深層学習モデルを構築しなさい。


- データを観察・理解する上で、データの構造を説明しながら、適切なデータ整形を行いなさい
- データ構造や分析結果に対して、少なくとも二つの図で可視化を行いなさい
- モデルの精度を評価し、できるだけ精度が高いモデルを得るよう、適切な特徴量エンジニアリングやモデル選定の考えもまとめなさい


In [98]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

In [99]:
# of no use
# marriage = pd.read_csv('Data/h27_marriage_df.csv', index_col=0)
# marriage.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1,
#          inplace=True)
# marriage.head()

In [100]:
# ignore, partly represented by job
# industry = pd.read_csv('Data/h27_indusry_df.csv', index_col=0)
# industry.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
# industry.head()

In [101]:
# ignore, family cannot influence income
# family = pd.read_csv('Data/h27_family_df.csv',index_col=0)
# family.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
# family.head()

In [102]:
# more labor will not result in higher average income, I quit this 
# labor = pd.read_csv('Data/h27_labor_df.csv', index_col=0)
# labor.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1,
#          inplace=True)
# labor.head()

In [103]:
#ignore, the same reason as house 
# house_info = pd.read_csv('Data/h27_house_info_df.csv', index_col=0)
# house_info.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
# house_info.head()

In [104]:
# ignore, the outcome of income, not cause
# house = pd.read_csv('Data/h27_house_df.csv', index_col=0)
# house.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
# house.head()

In [105]:
# ignore
# status = pd.read_csv('Data/h27_work_status_df.csv', index_col=0)
# status.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1,
#          inplace=True)
# status.head()

In [106]:
income = pd.read_csv('Data/income_district.csv', index_col=0)
income.index.name = 'district_id'
income.head()

Unnamed: 0_level_0,income_mean
district_id,Unnamed: 1_level_1
13228,480.984419
43514,439.775281
23237,487.507769
18208,505.495103
12238,385.423846


In [107]:
age= pd.read_csv('Data/h27_age_df.csv', index_col=0)
age.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
age = age.iloc[:, 0:19]
age.replace('-', 0, inplace=True)
age.replace('X', 0, inplace=True)
age = age.apply(pd.to_numeric)
age = age.groupby('district_id').sum()
age['10-24'] = age.iloc[:, 0:3].sum(axis=1)
age['25-65'] = age.iloc[:, 3:10].sum(axis=1)
age['65-100'] = age.iloc[:, 10:18].sum(axis=1)
age1 = age[['10-24','25-65','65-100']]
age1.head()

Unnamed: 0_level_0,10-24,25-65,65-100
district_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1101,93929,367568,193104
1102,131047,376469,265375
1103,108694,372350,236721
1104,79630,311448,183211
1105,88274,321206,196055


In [108]:
# temporily do not use
# gender = pd.read_csv('Data/h27_gender_df2.csv', index_col=0)
# gender.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
# gender.replace('-', 0, inplace=True)
# gender.replace('X', 0, inplace=True)
# gender.drop('family_count', axis=1, inplace=True)
# gender = gender.apply(pd.to_numeric)
# gender = gender.groupby('district_id').sum()
# gender.drop('population', axis=1, inplace=True)
# gender.head()

In [109]:
job = pd.read_csv('Data/h27_job_df.csv', index_col=0)
job.drop(['district2_id', 'level_identifier', 'state_name', 'city_name', 'district_name', 'district2_name'], axis=1, inplace=True)
job.replace('-', 0, inplace=True)
job.replace('X', 0, inplace=True)
job = job.apply(pd.to_numeric)
job = job.groupby('district_id').sum()
job['advanced_male'] = age.iloc[:, 0:3].sum(axis=1)
job['service_male'] = age.iloc[:, 3:5].sum(axis=1)
job['basic_male'] = age.iloc[:, 5:12].sum(axis=1)
job['advanced_female'] = age.iloc[:, 12:15].sum(axis=1)
job['service_female'] = age.iloc[:, 12:15].sum(axis=1)
job['basic_female'] = age.iloc[:, 15:22].sum(axis=1)
job1 = job[['advanced_male','service_male','basic_male', 'advanced_female', 'service_female', 'basic_female']]
job1.head()

Unnamed: 0_level_0,advanced_male,service_male,basic_male,advanced_female,service_female,basic_female
district_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1101,93929,102595,352935,81292,81292,678827
1102,131047,92209,409661,112563,112563,800611
1103,108694,97668,386695,100226,100226,742543
1104,79630,84233,311737,79798,79798,593416
1105,88274,88850,320061,86066,86066,628068


In [110]:
dataset = pd.concat([age1, job1], axis=1)

In [111]:
merged = pd.merge(income, dataset, on='district_id')
print(merged)

             income_mean  10-24   25-65  65-100  advanced_male  service_male  \
district_id                                                                    
13228         480.984419  27715   79480   63145          27715         17364   
43514         439.775281   3522   11266   13446           3522          2438   
23237         487.507769  29662   89979   61769          29662         20718   
18208         505.495103   9627   30417   28751           9627          7137   
12238         385.423846   8094   28608   35704           8094          5954   
...                  ...    ...     ...     ...            ...           ...   
14137         601.266088  79040  244090  141813          79040         55254   
2204          379.073909  10629   32459   29236          10629          7314   
16207         531.328234  10198   33806   29902          10198          7866   
13107         512.674825  87309  401970  217770          87309        121872   
8208          476.672694  31532   85621 

In [123]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scatterd_1 =  scaler.fit_transform(merged[["10-24"]])
scatterd_2 =  scaler.fit_transform(merged[["25-65"]])
scatterd_3 =  scaler.fit_transform(merged[["65-100"]])
scatterd_4 =  scaler.fit_transform(merged[["advanced_male"]])
scatterd_5 =  scaler.fit_transform(merged[["service_male"]])
scatterd_6 =  scaler.fit_transform(merged[["basic_male"]])
scatterd_7 =  scaler.fit_transform(merged[["advanced_female"]])
scatterd_8 =  scaler.fit_transform(merged[["service_female"]])
scatterd_9 =  scaler.fit_transform(merged[["basic_female"]])
scatterd_income =  scaler.fit_transform(merged[["income_mean"]])
merged2 = np.column_stack((scatterd_1, scatterd_2, scatterd_3, scatterd_4, scatterd_5, scatterd_6, scatterd_7, scatterd_8, scatterd_9, scatterd_income))
X = merged2[:, 0:9]
y = merged2[:, 9]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
model = LinearRegression()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')


Mean Squared Error: 0.014055924870374185
R^2 Score: 0.25922237907298584
