# 出会い系大手４サイトの統計データ解析

In [1]:
import pandas as pd
import datetime
import warnings
# SettingWithCopyWarning has moved between pandas releases: it was defined in
# pandas.core.common up to pandas 1.4 and is exposed from pandas.errors since
# 1.5. Look it up in both places so this cell does not raise AttributeError
# on newer installs, and skip the filter entirely when the installed pandas
# does not define the class at all.
SETTING_WITH_COPY_WARNING = getattr(
    pd.errors,
    "SettingWithCopyWarning",
    getattr(pd.core.common, "SettingWithCopyWarning", None),
)
if SETTING_WITH_COPY_WARNING is not None:
    warnings.simplefilter('ignore', SETTING_WITH_COPY_WARNING)

In [2]:
from sqlalchemy import create_engine
# SQLAlchemy URL for the MySQL database `auto_matching` on localhost, using the
# mysqldb driver and the utf8mb4 connection charset.
# NOTE(review): the user name and password are embedded in this URL — consider
# loading them from the environment or a config file before sharing the notebook.
url = "mysql+mysqldb://automatching:reiwa@localhost/auto_matching?charset=utf8mb4"
# Engine that is passed to pd.read_sql when the posts are loaded.
engine = create_engine(url)

In [3]:
# Select the columns used by this analysis from the `posts` table.
# NOTE(review): `limit 10000` is applied without an ORDER BY, so if the table
# holds more than 10000 rows the subset returned is chosen by the database —
# confirm that this cap is intended.
query = """
select
  id, title, name, age, posted_at, prefecture, city, site, genre
from posts
limit 10000;
"""

# Execute the query through the engine and load the result set into a DataFrame.
df = pd.read_sql(query, engine)

In [4]:
# Number of rows loaded from the database.
len(df)

2938

## 前処理

In [5]:
# Split the posts by source site. Each subset is materialised with .copy() so
# that the per-site column rewrites performed afterwards act on an independent
# DataFrame instead of on a selection derived from `df`; assigning into such a
# selection is what makes pandas emit SettingWithCopyWarning.
happymail_df = df.query('site == "ハッピーメール"').copy()
wakuwaku_df = df.query('site == "ワクワクメール"').copy()
pcmax_df = df.query('site == "PCMAX"').copy()
ikukuru_df = df.query('site == "イククル"').copy()

In [6]:
# Unified genre labels. Every site uses its own genre names; the per-site
# convert_genre functions translate them into one of these values so the
# sites can be aggregated together.
GENRE_SUGU = "すぐ会いたい"
GENRE_NOT_SUGU = "すぐじゃないけど"
GENRE_ADULT_H = "アダルト"
GENRE_ADULT_LOVE = "大人の恋愛"
GENRE_ABNORMAL = "アブノーマル"
GENRE_MIDDLE_AGE = "ミドルエイジ"
# Fallback used by the convert_genre functions for any label they do not list.
GENRE_OTHER = "その他"

In [7]:
# Unified age-bracket labels. The numeric suffixes in the constant names are
# the inclusive age bounds used by the numeric convert_age functions
# (e.g. AGE_20_23 covers 20 <= age < 24).
AGE_UNDER_20 = "18-19歳"
AGE_20_23 = "20代前半"
AGE_24_26 = "20代半ば"
AGE_27_29 = "20代後半"
AGE_30_33 = "30代前半"
AGE_34_36 = "30代半ば"
AGE_37_39 = "30代後半"
AGE_40_43 = "40代前半"
AGE_44_46 = "40代半ば"
AGE_47_49 = "40代後半"
AGE_50_53 = "50代前半"
AGE_54_56 = "50代半ば"
AGE_57_59 = "50代後半"
# NOTE(review): the numeric convert_age functions assign this label to every
# age of 60 or more, while the label text itself reads "early 60s" — confirm
# that this is the intended bucket name.
AGE_OVER_60 = "60代前半"

### ハッピーメール

In [8]:
def convert_age(x):
    """Normalise a Happy Mail age label.

    The label "18〜19歳" is replaced by the shared AGE_UNDER_20 label; every
    other value is handed back unchanged.
    """
    return AGE_UNDER_20 if x == "18〜19歳" else x
# Rewrite the Happy Mail age column with the convert_age defined immediately
# above (the name convert_age is redefined later for other sites).
happymail_df['age'] = happymail_df['age'].apply(convert_age)

In [9]:
def convert_genre(x):
    """Translate a Happy Mail genre label into the unified genre labels.

    Labels that are not listed in the table map to GENRE_OTHER.
    """
    unified_by_label = {
        '今ｽｸﾞ会いたい': GENRE_SUGU,
        '大人の出会い': GENRE_ADULT_H,
        '大人の恋人関係': GENRE_ADULT_LOVE,
        '刺激的な出会い': GENRE_ABNORMAL,
        'ﾐﾄﾞﾙｴｲｼﾞ/ｼﾆｱ': GENRE_MIDDLE_AGE,
    }
    return unified_by_label.get(x, GENRE_OTHER)
# Rewrite the Happy Mail genre column with the convert_genre defined
# immediately above (the name convert_genre is redefined for each site).
happymail_df['genre'] = happymail_df['genre'].apply(convert_genre)

### ワクワクメール

In [10]:
def convert_genre(x):
    """Translate a Wakuwaku Mail genre label into the unified genre labels.

    Labels that are not listed in the table map to GENRE_OTHER.
    """
    unified_by_label = {
        'すぐ会いたい': GENRE_SUGU,
        '今日じゃないけど...': GENRE_NOT_SUGU,
        'アダルト・H': GENRE_ADULT_H,
        '大人の恋人候補': GENRE_ADULT_LOVE,
        'アブノーマル': GENRE_ABNORMAL,
        'ミドルエイジ': GENRE_MIDDLE_AGE,
    }
    return unified_by_label.get(x, GENRE_OTHER)
# Rewrite the Wakuwaku Mail genre column with the convert_genre defined
# immediately above (the name convert_genre is redefined for each site).
wakuwaku_df['genre'] = wakuwaku_df['genre'].apply(convert_genre)

### PCMAX

In [11]:
def convert_age(x):
    """Bucket a PCMAX age string into the unified age-bracket labels.

    Any '歳' in the string is removed and the remainder is parsed with int(),
    so a value that is not a string or does not parse as an integer raises.
    Ages below 20 map to AGE_UNDER_20 and ages of 60 or more to AGE_OVER_60.
    """
    years = int(x.replace('歳', ''))
    # (exclusive upper bound, label) pairs in ascending order; the first
    # bound that exceeds the age decides the bracket.
    brackets = (
        (20, AGE_UNDER_20),
        (24, AGE_20_23),
        (27, AGE_24_26),
        (30, AGE_27_29),
        (34, AGE_30_33),
        (37, AGE_34_36),
        (40, AGE_37_39),
        (44, AGE_40_43),
        (47, AGE_44_46),
        (50, AGE_47_49),
        (54, AGE_50_53),
        (57, AGE_54_56),
        (60, AGE_57_59),
    )
    for upper_bound, label in brackets:
        if years < upper_bound:
            return label
    return AGE_OVER_60
# Rewrite the PCMAX age column with the numeric convert_age defined
# immediately above, turning each age string into a bracket label.
pcmax_df['age'] = pcmax_df['age'].apply(convert_age)

In [12]:
def convert_genre(x):
    """Translate a PCMAX genre label into the unified genre labels.

    Labels that match none of the listed pairs map to GENRE_OTHER.
    """
    # NOTE(review): '既婚者' is filed under GENRE_MIDDLE_AGE here — confirm
    # that this grouping is intended.
    label_pairs = (
        ('スグ会いたい', GENRE_SUGU),
        ('スグじゃないけど', GENRE_NOT_SUGU),
        ('アブノーマル・SM', GENRE_ABNORMAL),
        ('既婚者', GENRE_MIDDLE_AGE),
    )
    for site_label, unified in label_pairs:
        if x == site_label:
            return unified
    return GENRE_OTHER
# Rewrite the PCMAX genre column with the convert_genre defined immediately
# above (the name convert_genre is redefined for each site).
pcmax_df['genre'] = pcmax_df['genre'].apply(convert_genre)

### イククル

In [13]:
def convert_age(x):
    """Bucket an Ikukuru age string into the unified age-bracket labels.

    Any '歳' in the string is removed and the remainder is parsed with int(),
    so a value that is not a string or does not parse as an integer raises.
    Ages below 20 map to AGE_UNDER_20 and ages of 60 or more to AGE_OVER_60.
    """
    years = int(x.replace('歳', ''))
    # Exclusive upper bounds and their labels, kept in matching order.
    upper_bounds = (20, 24, 27, 30, 34, 37, 40, 44, 47, 50, 54, 57, 60)
    labels = (
        AGE_UNDER_20,
        AGE_20_23,
        AGE_24_26,
        AGE_27_29,
        AGE_30_33,
        AGE_34_36,
        AGE_37_39,
        AGE_40_43,
        AGE_44_46,
        AGE_47_49,
        AGE_50_53,
        AGE_54_56,
        AGE_57_59,
    )
    for bound, label in zip(upper_bounds, labels):
        if years < bound:
            return label
    return AGE_OVER_60
# Rewrite the Ikukuru age column with the numeric convert_age defined
# immediately above, turning each age string into a bracket label.
ikukuru_df['age'] = ikukuru_df['age'].apply(convert_age)

In [14]:
def convert_genre(x):
    """Translate an Ikukuru genre label into the unified genre labels.

    Labels that are not listed in the table map to GENRE_OTHER.
    """
    # Two site labels ('ミドルエイジ' and '既婚者希望') share GENRE_MIDDLE_AGE.
    unified_by_label = {
        'すぐ会いたい': GENRE_SUGU,
        'ヒミツＨ!秘密だよ!': GENRE_ADULT_H,
        'アブノーマル': GENRE_ABNORMAL,
        'ミドルエイジ': GENRE_MIDDLE_AGE,
        '既婚者希望': GENRE_MIDDLE_AGE,
    }
    return unified_by_label.get(x, GENRE_OTHER)
# Rewrite the Ikukuru genre column with the convert_genre defined immediately
# above (the name convert_genre is redefined for each site).
ikukuru_df['genre'] = ikukuru_df['genre'].apply(convert_genre)

In [15]:
# Stack the four normalised per-site frames back into a single DataFrame,
# replacing the raw `df`. Rows whose site is none of the four are not included.
df = pd.concat([happymail_df, wakuwaku_df, pcmax_df, ikukuru_df])

In [16]:
# Row count after recombining; compare with the count right after loading to
# check that no rows were dropped by the per-site split.
len(df)

2938

## データ分析

### サイト別

In [21]:
# Posts per site: count the non-null ids in each group and present them as a
# one-column table, largest first.
output = (
    df.groupby("site")["id"]
    .count()
    .rename_axis("サイト")
    .to_frame("投稿数")
)
output.sort_values("投稿数", ascending=False)

Unnamed: 0_level_0,投稿数
サイト,Unnamed: 1_level_1
ワクワクメール,1175
PCMAX,797
イククル,586
ハッピーメール,380


### ジャンル別

In [23]:
# Posts per unified genre: count the non-null ids in each group and present
# them as a one-column table, largest first.
output = (
    df.groupby("genre")["id"]
    .count()
    .rename_axis("ジャンル")
    .to_frame("投稿数")
)
output.sort_values("投稿数", ascending=False)

Unnamed: 0_level_0,投稿数
ジャンル,Unnamed: 1_level_1
すぐ会いたい,1676
アダルト,392
すぐじゃないけど,268
ミドルエイジ,236
アブノーマル,205
大人の恋愛,144
その他,17


### 地域別

In [19]:
# Posts per city: count the non-null ids in each group and show the 20
# cities with the most posts.
output = (
    df.groupby("city")["id"]
    .count()
    .rename_axis("地域")
    .to_frame("投稿数")
)
output.sort_values("投稿数", ascending=False).head(20)

Unnamed: 0_level_0,投稿数
地域,Unnamed: 1_level_1
,358
新宿区,351
豊島区,276
渋谷区,130
横浜市西区,121
横浜市中区,87
台東区,75
東京都,64
世田谷区,49
藤沢市,47


### 年代別

In [24]:
# Posts per unified age bracket: count the non-null ids in each group and
# show the 20 brackets with the most posts.
output = (
    df.groupby("age")["id"]
    .count()
    .rename_axis("年代")
    .to_frame("投稿数")
)
output.sort_values("投稿数", ascending=False).head(20)

Unnamed: 0_level_0,投稿数
年代,Unnamed: 1_level_1
20代前半,876
20代半ば,644
20代後半,508
30代前半,271
18-19歳,155
30代半ば,153
30代後半,137
40代前半,81
40代半ば,39
40代後半,35


### 時間帯別

In [27]:
# Posts per hour of day: group on the hour taken from each posted_at value,
# count the non-null ids, and show up to 24 rows in hour order.
output = (
    df.groupby(df.posted_at.map(lambda t: t.hour))["id"]
    .count()
    .rename_axis("時間帯")
    .to_frame("投稿数")
)
output.head(24)

Unnamed: 0_level_0,投稿数
時間帯,Unnamed: 1_level_1
0,224
1,13
2,12
3,17
4,8
5,15
6,7
7,16
8,12
9,24
