# ワクワクメールのデータ分析

In [1]:
import pandas as pd
import datetime
import seaborn as sns
import japanize_matplotlib

In [2]:
# Locations of the raw per-prefecture CSV exports, relative to this notebook.
TOKYO_FILE_PATH = './../rawdata/wakuwaku_tokyo.csv'
KANAGAWA_FILE_PATH = './../rawdata/wakuwaku_kanagawa.csv'

# Read both files with pandas' default CSV settings, one frame per prefecture.
tokyo_df, kanagawa_df = (
    pd.read_csv(csv_path)
    for csv_path in (TOKYO_FILE_PATH, KANAGAWA_FILE_PATH)
)

In [3]:
# Row counts of the two source frames, shown as (tokyo, kanagawa).
tuple(len(frame) for frame in (tokyo_df, kanagawa_df))

(704, 309)

In [4]:
# Full outer join of the two prefecture frames. No `on=` is given, so pandas
# joins on every column the two frames have in common.
df_master = tokyo_df.merge(kanagawa_df, how="outer")

# Analysis copy without the link columns; `df_master` keeps them.
df = df_master.drop(columns=["image_url", "url"])

In [5]:
# Print the number of rows in the combined frame.
print(df.shape[0])
# df.head()

1013


## データ分析

### 地域別

In [6]:
# Count non-null `id` values per city. `output` holds the table in group-key
# order; the last line displays the ten cities with the most posts.
output = (
    df.groupby("city")["id"]
    .count()
    .to_frame(name="投稿数")
    .rename_axis("地域")
)
output.sort_values(by="投稿数", ascending=False).iloc[:10]

Unnamed: 0_level_0,投稿数
地域,Unnamed: 1_level_1
新宿区,177
豊島区,132
横浜市西区,58
渋谷区,33
台東区,27
横浜市神奈川区,26
世田谷区,23
横浜市中区,22
足立区,21
藤沢市,20


### 年代別

In [7]:
# Count non-null `id` values per age bracket. `output` holds the table in
# group-key order; the last line displays up to twenty brackets, busiest first.
output = (
    df.groupby("age")["id"]
    .count()
    .to_frame(name="投稿数")
    .rename_axis("年代")
)
output.sort_values(by="投稿数", ascending=False).iloc[:20]

Unnamed: 0_level_0,投稿数
年代,Unnamed: 1_level_1
20代前半,301
20代半ば,167
20代後半,158
30代前半,118
18-19歳,80
30代後半,52
30代半ば,47
40代前半,36
40代半ば,20
40代後半,12


### 時間帯別

In [8]:
# Parse the whole `posted_at` column in one vectorized call instead of
# invoking pd.to_datetime once per row through DataFrame.apply.
# NOTE(review): assumes every `posted_at` value uses the same timestamp
# format — confirm against the raw CSVs.
posted_at = pd.to_datetime(df["posted_at"])

# Count non-null `id` values per hour of day. Hours without any post do not
# appear in the table.
output = (
    df.groupby(posted_at.dt.hour)["id"]
    .count()
    .to_frame(name="投稿数")
    .rename_axis("時間帯")
)
output.head(24)

Unnamed: 0_level_0,投稿数
時間帯,Unnamed: 1_level_1
0,14
1,7
2,5
3,10
4,8
5,6
6,2
7,8
8,5
9,14


### カテゴリ別

In [9]:
# Count non-null `id` values per genre. `output` holds the table in group-key
# order; the last line displays up to seven genres, busiest first.
output = (
    df.groupby("genre")["id"]
    .count()
    .to_frame(name="投稿数")
    .rename_axis("カテゴリ")
)
output.sort_values(by="投稿数", ascending=False).iloc[:7]

Unnamed: 0_level_0,投稿数
カテゴリ,Unnamed: 1_level_1
すぐ会いたい,471
アダルト・H,180
今日じゃないけど...,137
大人の恋人候補,95
アブノーマル,74
ミドルエイジ,56


## 検索

### キーワード検索

In [10]:
#keyword = "会いたい"
#df[df['title'].str.contains(keyword)]

### 地域検索

In [11]:
# City name used by the (currently disabled) substring search on `city` below.
area = "川崎市中原区"
# df[df['city'].str.contains(area)]

## サクラ判定

In [12]:
# Work on an independent copy so that any in-place change made to `eval_df`
# (for example adding a column) cannot leak back into `df_master`, even when
# cells are re-run or run out of order.
eval_df = df_master.copy()

### 年齢によるフィルター

In [13]:
# Age labels of 40 and above. Not referenced by the filter below, which works
# on the numeric prefix of the label instead; kept so the name stays defined.
old_age_list = ["40代前半", "40代半ば", "40代後半", "50代前半", "50代半ば", "60代前半", "50代後半"]

# Take the leading digits of each age label ("20代前半" -> 20, "18-19歳" -> 18).
# Labels that are missing or do not start with a digit become NaN instead of
# raising, and NaN compares False below, so those rows are dropped.
age_lower_bound = pd.to_numeric(
    eval_df["age"].str.extract(r"^(\d+)", expand=False),
    errors="coerce",
)
eval_df = eval_df[age_lower_bound < 40]

### NGワードによるフィルター

In [14]:
# Substrings that disqualify a post when they appear in its title.
ng_words = ["ニューハーフ", "下着", "生", "ftm", "MTF", "ビッチ", "お願い",
            "イラマ", "オネエ", "痴漢", "男の娘", "車", "バイ",
            "おなべ", "ハメ撮り", "はめ撮り", "初体験", "処女", "童貞"]


def contains_ng_word(title):
    """Return True when `title` contains any entry of `ng_words`.

    Matching ignores letter case, so the Latin-script entries ("ftm", "MTF")
    are found however the title capitalizes them.

    Non-string values (such as NaN for a missing title) are reported as not
    containing an NG word, so those rows are kept rather than raising a
    TypeError on the `in` test.
    """
    if not isinstance(title, str):
        return False
    lowered_title = title.lower()
    return any(word.lower() in lowered_title for word in ng_words)


# Keep only the rows whose title is free of NG words.
mask = ~eval_df["title"].map(contains_ng_word).astype(bool)
eval_df = eval_df[mask]

### 長すぎる名前のフィルター

In [15]:
# Names with this many characters or more are filtered out.
MAX_NAME_LENGTH = 10


def has_short_name(name):
    """Return True when `name` should be kept by the name-length filter.

    A string passes when it is shorter than MAX_NAME_LENGTH characters.
    Any non-string value (such as NaN for a missing name) always passes.
    """
    if isinstance(name, str):
        return len(name) < MAX_NAME_LENGTH
    return True


# Keep only the rows whose name passes the length check.
mask = eval_df["name"].map(has_short_name)
eval_df = eval_df[mask]

### タイトルの出現頻度
タイトルの出現頻度による評価。頻度もさらに2回の頻度、3回以上の頻度で細分化できる。

In [16]:
# Per-title count of non-null values in every column.
duplicated_title_df = eval_df.groupby("title").count()

# Same table ordered so the most frequently used titles come first.
output = duplicated_title_df.sort_values(by="id", ascending=False)

In [17]:
# Reduce the count table to a single column, `title_freq`, indexed by title.
output = (
    output[["id"]]
    .rename(columns={"id": "title_freq"})
    .rename_axis("title")
)

In [18]:
# Attach each post's title frequency; the left join keeps every row of `eval_df`.
eval_df = eval_df.merge(output, how="left", on="title")

### プロフィールの出現頻度

In [19]:
# eval_df["name_age_city"] = eval_df["name"] + "_" + eval_df["age"] + "_" + eval_df["city"]
# Profile key built from name and age only. Despite the column name, city is
# not part of the key (the variant that appended it is disabled above).
profile_key = eval_df["name"] + "_" + eval_df["age"]
eval_df = eval_df.assign(name_age_city=profile_key)

# Per-profile-key count of non-null values in every column.
output = eval_df.groupby("name_age_city").count()

In [20]:
# Reduce the count table to a single column, `profile_freq`, indexed by the
# profile key.
output = (
    output[["id"]]
    .rename(columns={"id": "profile_freq"})
    .rename_axis("name_age_city")
)

In [21]:
# Profile keys that occur exactly once.
output.loc[output["profile_freq"] == 1]

Unnamed: 0_level_0,profile_freq
name_age_city,Unnamed: 1_level_1
**まいたん**_20代前半,1
AK_18-19歳,1
K_20代前半,1
L_20代前半,1
N_20代半ば,1
...,...
離れちっち_20代後半,1
雨宮_20代半ば,1
雪_20代前半,1
風花_20代後半,1


In [22]:
# Attach each post's profile frequency; the left join keeps every row of `eval_df`.
eval_df = eval_df.merge(output, how="left", on="name_age_city")

## 結果

In [23]:
# Parse the whole `posted_at` column in one vectorized call instead of
# invoking pd.to_datetime once per row through DataFrame.apply.
# NOTE(review): assumes every `posted_at` value uses the same timestamp
# format — confirm against the raw CSVs.
posted_at = pd.to_datetime(eval_df["posted_at"])
result = eval_df.assign(datetime=posted_at).sort_values("datetime")

# Cut-off for "recent" posts: 24 hours before the moment this cell runs.
# NOTE(review): because the cut-off comes from the wall clock, re-running the
# notebook later on the same CSVs selects fewer (eventually zero) rows.
one_day_ago = datetime.datetime.now() - datetime.timedelta(days=1)

# Keep posts that are recent AND whose profile key and title each occur once.
is_recent = result["datetime"] > one_day_ago
is_one_off = (result["profile_freq"] == 1) & (result["title_freq"] == 1)
result = result[is_recent & is_one_off]

In [24]:
# Strip the link columns and the helper columns added during scoring before export.
columns_to_drop = ["url", "image_url", "title_freq", "profile_freq", "name_age_city", "datetime"]
out = result.drop(columns=columns_to_drop)

In [25]:
# Number of rows that will be exported.
out.shape[0]

249

In [26]:
# Write the selected posts to the data directory next to the notebook's parent.
# The frame's index is written as the first column (pandas' default).
out_file_name = "wakuwaku_out.csv"
out_path = "./../data/" + out_file_name
out.to_csv(out_path)