In [15]:
# Mount Google Drive inside the Colab runtime at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [30]:
# Make the analysis folder the working directory; the relative paths used by later cells are resolved from here.
%cd "/content/drive/MyDrive/qiita-love/data_analysis"

/content/drive/MyDrive/qiita-love/data_analysis


In [1]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
import pickle

In [2]:
# Let pandas render up to 100 columns and 100 rows before truncating the display.
for display_option in ("display.max_columns", "display.max_rows"):
    pd.set_option(display_option, 100)

In [3]:
# Read the monthly CSV files for January through December 2020 and stack them into df.
# Every file is read first and concatenated in ONE call: calling pd.concat inside the
# loop re-copies all rows loaded so far on each iteration and starts from an empty
# DataFrame, a pattern that newer pandas versions deprecate.
# A missing month still raises FileNotFoundError, so an incomplete data set is not
# silently accepted.
monthly_frames = [
    pd.read_csv(f"../data_collection/data/2020-{month:02}.csv", encoding="utf8")
    for month in range(1, 13)
]
df = pd.concat(monthly_frames, ignore_index=True)
df.head()

FileNotFoundError: [Errno 2] No such file or directory: '../data_collection/data/2020-09.csv'

In [20]:
# Inspect the data type of each column of df.
df.dtypes

likes_count        float64
title               object
body                object
created_date        object
created_time        object
tags                object
followers_count    float64
organization        object
items_count        float64
dtype: object

In [21]:
# Count the missing values in each column of df.
df.isnull().sum()

likes_count             1
title                   0
body                    1
created_date            1
created_time            1
tags                    1
followers_count         2
organization       110525
items_count             2
dtype: int64

In [22]:
### Changed section ###
# Turn organization into a 0/1 flag: 1 where a value is present, 0 where it is missing.
has_organization = df["organization"].notna()
df["organization"] = has_organization.astype(int)
df.head()

Unnamed: 0,likes_count,title,body,created_date,created_time,tags,followers_count,organization,items_count
0,1.0,品質担保に本気で取り組んでみている話,株式会社オズビジョンの @terra_yucco です。2020/01 現在、オズビジョン ...,2020-01-31,23:59:59,"['仕様', '品質管理', '品質', 'QCD']",43.0,1,97.0
1,5.0,【Unity】Screen.safeAreaとiOSステータスバーの罠（？）,# Unityで取得できる Screen.safeAreaには罠（？）がある罠に遭遇したのは...,2020-01-31,23:56:36,"['iOS', 'Unity', 'SafeArea']",2.0,0,7.0
2,3.0,徹底攻略！“SATySFiのロゴ”の出し方,**SATySFiのロゴ**といえば、もちろんコレですね。![image-1a.png](h...,2020-01-31,23:54:15,['SATySFi'],157.0,0,65.0
3,0.0,【KPI】家系図テーブルの操作 1【oracle】,某炎の紋章の家系図見てて、TRPGなどに使えるかなと思って家系図テーブルの運用を考える。階層...,2020-01-31,23:50:44,['oracle'],1.0,0,10.0
4,0.0,プログラミング・フォロを組み立てる,#プログラミング・フォロmicro:bitを内蔵して６本足で自律して歩き回ることの出来る [...,2020-01-31,23:45:47,"['RaspberryPi', 'microbit']",3.0,0,11.0


In [23]:
# Re-check the number of missing values in each column of df.
df.isnull().sum()

likes_count        1
title              0
body               1
created_date       1
created_time       1
tags               1
followers_count    2
organization       0
items_count        2
dtype: int64

In [24]:
# Discard every row that contains a missing value, then renumber the index from 0.
df = df.dropna().reset_index(drop=True)

In [25]:
# Preview the rows whose likes_count is 1000 or more.
is_highly_liked = df["likes_count"] >= 1000
df[is_highly_liked].head()

Unnamed: 0,likes_count,title,body,created_date,created_time,tags,followers_count,organization,items_count
749,1386.0,Vue開発者のためのVue.jsベストプラクティス集15選,# はじめに**みなさん、Vue使ってますかー！**・・・・・・・（へんじがない。ただのしか...,2020-01-29,23:00:56,"['JavaScript', 'Vue.js', 'Nuxt']",131.0,0,15.0
1008,1523.0,Vue.jsで作成された、ちょっと面白くて役立ちそうなサービス,## [こちらに移行しました。(2020/05/16)](https://tech-blog...,2020-01-29,08:43:49,"['JavaScript', 'Bootstrap', 'ツール', 'Vue.js', '...",35.0,0,19.0
1459,1024.0,FFT（高速フーリエ変換）を完全に理解する話,"FFT(Fast Fourier Transform),高速フーリエ変換についての記事です。...",2020-01-27,22:00:06,"['アルゴリズム', 'math', 'AtCoder', '競技プログラミング']",163.0,0,28.0
3500,1062.0,2020年現在 Web系企業で採用されてる技術についてまとめてみた,# はじめに2020年も始まりましたね！タイトル通りですが、2020年現在スタートアップや大...,2020-01-22,10:43:23,"['初心者', 'ツール', 'まとめ', '初心者向け', 'プログラミング言語']",24.0,0,27.0
3868,4834.0,良いコードの書き方,# 概要チームによる継続的開発を前提としたコーディングのガイドライン。特定の言語を対象とした...,2020-01-21,10:50:39,"['Java', 'プログラミング', 'コーディング規約', 'チーム開発', 'Swift']",128.0,1,29.0


In [26]:
# Cap likes_count at 1000 (anything at or above 1000 becomes 1000), then preview the capped rows.
df["likes_count"] = df["likes_count"].clip(upper=1000)
df[df["likes_count"] >= 1000].head()

Unnamed: 0,likes_count,title,body,created_date,created_time,tags,followers_count,organization,items_count
749,1000.0,Vue開発者のためのVue.jsベストプラクティス集15選,# はじめに**みなさん、Vue使ってますかー！**・・・・・・・（へんじがない。ただのしか...,2020-01-29,23:00:56,"['JavaScript', 'Vue.js', 'Nuxt']",131.0,0,15.0
1008,1000.0,Vue.jsで作成された、ちょっと面白くて役立ちそうなサービス,## [こちらに移行しました。(2020/05/16)](https://tech-blog...,2020-01-29,08:43:49,"['JavaScript', 'Bootstrap', 'ツール', 'Vue.js', '...",35.0,0,19.0
1459,1000.0,FFT（高速フーリエ変換）を完全に理解する話,"FFT(Fast Fourier Transform),高速フーリエ変換についての記事です。...",2020-01-27,22:00:06,"['アルゴリズム', 'math', 'AtCoder', '競技プログラミング']",163.0,0,28.0
3500,1000.0,2020年現在 Web系企業で採用されてる技術についてまとめてみた,# はじめに2020年も始まりましたね！タイトル通りですが、2020年現在スタートアップや大...,2020-01-22,10:43:23,"['初心者', 'ツール', 'まとめ', '初心者向け', 'プログラミング言語']",24.0,0,27.0
3868,1000.0,良いコードの書き方,# 概要チームによる継続的開発を前提としたコーディングのガイドライン。特定の言語を対象とした...,2020-01-21,10:50:39,"['Java', 'プログラミング', 'コーディング規約', 'チーム開発', 'Swift']",128.0,1,29.0


In [27]:
# Derive date/time features:
#   created_days_ago - whole days between the posting date and the moment this cell runs
#   created_month    - calendar month of the posting date
#   created_time     - hour of posting, taken from the first two characters of the time string
# The raw created_date column is removed once the features are derived.
posted_on = pd.to_datetime(df["created_date"])
reference_time = pd.to_datetime("today")
df = df.assign(
    created_time=df["created_time"].str[:2].astype(int),
    created_days_ago=(reference_time - posted_on).dt.days,
    created_month=posted_on.dt.month,
).drop(columns="created_date")
df.head()

Unnamed: 0,likes_count,title,body,created_time,tags,followers_count,organization,items_count,created_days_ago,created_month
0,1.0,品質担保に本気で取り組んでみている話,株式会社オズビジョンの @terra_yucco です。2020/01 現在、オズビジョン ...,23,"['仕様', '品質管理', '品質', 'QCD']",43.0,1,97.0,1040,1
1,5.0,【Unity】Screen.safeAreaとiOSステータスバーの罠（？）,# Unityで取得できる Screen.safeAreaには罠（？）がある罠に遭遇したのは...,23,"['iOS', 'Unity', 'SafeArea']",2.0,0,7.0,1040,1
2,3.0,徹底攻略！“SATySFiのロゴ”の出し方,**SATySFiのロゴ**といえば、もちろんコレですね。![image-1a.png](h...,23,['SATySFi'],157.0,0,65.0,1040,1
3,0.0,【KPI】家系図テーブルの操作 1【oracle】,某炎の紋章の家系図見てて、TRPGなどに使えるかなと思って家系図テーブルの運用を考える。階層...,23,['oracle'],1.0,0,10.0,1040,1
4,0.0,プログラミング・フォロを組み立てる,#プログラミング・フォロmicro:bitを内蔵して６本足で自律して歩き回ることの出来る [...,23,"['RaspberryPi', 'microbit']",3.0,0,11.0,1040,1


In [35]:
# Vectorize a text column with TF-IDF, then reduce it with SVD to n_components dimensions (20 by default).
def vectorize_text(text, n_components=20):
    """Fit TF-IDF and truncated SVD on one text column and return the reduced features.

    A new ``TfidfVectorizer`` and a new ``TruncatedSVD`` are fitted on
    ``df[text]`` (``df`` is the notebook-level DataFrame). Both fitted objects
    are pickled, to ``../app/model_tfidf/trained_model_<text>.pkl`` and
    ``../app/model_svd/trained_model_<text>.pkl`` respectively.

    Args:
        text: Name of the column of ``df`` to vectorize. It is also used in
            the pickle file names and as the prefix of the output columns.
        n_components: Number of SVD components to keep.

    Returns:
        DataFrame with columns ``<text>_0`` ... ``<text>_<n_components - 1>``
        carrying the same index as ``df``.
    """
    tfidf = TfidfVectorizer()
    df_tfidf = tfidf.fit_transform(df[text])
    file = '../app/model_tfidf/trained_model_' + text + '.pkl'
    # A context manager guarantees the pickle is flushed and the handle is closed.
    with open(file, 'wb') as f:
        pickle.dump(tfidf, f)

    svd = TruncatedSVD(n_components=n_components)
    df_svd = svd.fit_transform(df_tfidf)
    file = '../app/model_svd/trained_model_' + text + '.pkl'
    with open(file, 'wb') as f:
        pickle.dump(svd, f)

    # Reuse df's index so that a column-wise concat with df matches rows by label
    # even when df's index is not a plain 0..n-1 range.
    columns = [f"{text}_{i}" for i in range(n_components)]
    df_ret = pd.DataFrame(df_svd, index=df.index, columns=columns)
    return df_ret

In [None]:
# Vectorize a text column with the already-fitted TF-IDF and SVD models (for the second run onwards).
# NOTE: this definition has the same name as the fitting version; running this cell replaces it.
def vectorize_text(text, n_components=20):
    """Transform one text column with the previously saved TF-IDF and SVD models.

    The models are read from ``../app/model_tfidf/trained_model_<text>.pkl``
    and ``../app/model_svd/trained_model_<text>.pkl`` and applied to
    ``df[text]`` (``df`` is the notebook-level DataFrame). Nothing is refitted
    and the model files are only read, never rewritten.

    Args:
        text: Name of the column of ``df`` to vectorize. It is also used in
            the pickle file names and as the prefix of the output columns.
        n_components: Number of output columns to name. It is only used to
            build the column labels, so it has to match the width of the
            stored SVD model's output; otherwise building the DataFrame fails.

    Returns:
        DataFrame with columns ``<text>_0`` ... ``<text>_<n_components - 1>``
        carrying the same index as ``df``.
    """
    file = '../app/model_tfidf/trained_model_' + text + '.pkl'
    # pickle.load can execute arbitrary code: only load files this project wrote itself.
    with open(file, 'rb') as f:
        tfidf = pickle.load(f)
    df_tfidf = tfidf.transform(df[text])

    file = '../app/model_svd/trained_model_' + text + '.pkl'
    with open(file, 'rb') as f:
        svd = pickle.load(f)
    df_svd = svd.transform(df_tfidf)

    # Reuse df's index so that a column-wise concat with df matches rows by label
    # even when df's index is not a plain 0..n-1 range.
    columns = [f"{text}_{i}" for i in range(n_components)]
    df_ret = pd.DataFrame(df_svd, index=df.index, columns=columns)
    return df_ret

In [36]:
# Replace each text column by its vectorized features.
# For every (column, dimensions) pair: vectorize the column, append the new feature
# columns to df, drop rows that contain missing values, and remove the original text column.
text_feature_dims = {"title": 20, "body": 20, "tags": 20}
for col, n_components in text_feature_dims.items():
    df_vec = vectorize_text(col, n_components)
    df = pd.concat([df, df_vec], axis=1).dropna().drop(columns=col)
df.head()

Unnamed: 0,likes_count,created_time,followers_count,organization,items_count,created_days_ago,created_month,title_0,title_1,title_2,title_3,title_4,title_5,title_6,title_7,title_8,title_9,title_10,title_11,title_12,title_13,title_14,title_15,title_16,title_17,title_18,title_19,body_0,body_1,body_2,body_3,body_4,body_5,body_6,body_7,body_8,body_9,body_10,body_11,body_12,body_13,body_14,body_15,body_16,body_17,body_18,body_19,tags_0,tags_1,tags_2,tags_3,tags_4,tags_5,tags_6,tags_7,tags_8,tags_9,tags_10,tags_11,tags_12,tags_13,tags_14,tags_15,tags_16,tags_17,tags_18,tags_19
0,1.0,23,43.0,1,97.0,1040,1,-4.500403e-16,-1.272622e-14,8.290435e-14,-5.265333e-14,-4.472895e-14,4.40013e-14,2.964003e-14,2.400184e-13,-4.000405e-13,-1.618981e-13,2.944846e-13,3.651733e-13,9.342856e-13,2.177648e-14,1.186e-12,-7.561057e-13,-5.044475e-14,-2.154481e-12,5.203612e-14,-2.046689e-12,0.006063,9.050917e-07,0.00474,0.004435,-0.001534,0.002026,-0.011782,-0.002644,0.0041,-0.003432,0.018672,-0.006545,-0.003133,-0.008534,-0.009266,-0.004357,-0.001449,-0.004306,0.0046,-0.002695,1.368589e-05,-5.982908e-09,5e-06,-4.152438e-07,-5.830805e-07,-7.34693e-06,-1.226697e-07,-3.885491e-06,3.7e-05,-7.453559e-08,4e-06,-6.674596e-07,1.235866e-07,4e-06,3e-06,7e-06,4e-06,-3e-06,4e-06,3e-06
1,5.0,23,2.0,0,7.0,1040,1,0.0005157261,0.0008719502,0.000599429,0.001790704,0.003987319,-0.001195041,0.001229403,0.002457885,0.004648645,0.002567079,0.01697939,-0.003971771,0.005568064,0.05167523,0.01538894,-0.01747872,-0.07150699,-0.07751317,0.3230485,-0.06614566,0.034778,-0.002825193,0.007545,0.001325,-0.002589,0.005266,-0.004411,-0.014544,0.005039,0.017687,0.004082,-0.0056,0.012383,0.001789,0.008917,-0.000145,-0.00431,0.00565,5.2e-05,-0.004166,0.001201126,0.002715158,0.001893,0.006633753,-0.005441147,-0.004447897,0.007987663,0.03491092,0.010981,-0.003724159,0.11179,0.1485597,0.002813387,0.000618,-0.002093,0.000319,0.330533,0.027272,-0.136634,-2e-06
2,3.0,23,157.0,0,65.0,1040,1,1.985387e-06,9.942188e-07,3.635362e-06,5.121854e-06,7.352084e-06,6.192135e-06,1.924672e-05,2.473045e-05,1.166271e-05,2.761942e-06,0.0001583611,0.0003560817,-1.666184e-05,-2.614952e-05,4.400083e-06,-2.193658e-05,2.240486e-05,3.203007e-05,8.196343e-06,2.379514e-06,0.156661,-0.04337546,-0.023106,-0.028917,0.000793,-0.014235,0.026373,0.01116,0.00782,-0.005046,0.000739,-0.00407,-0.01767,0.026946,0.015278,-0.020146,0.009176,0.031891,0.014355,-0.001328,8.599358e-07,2.391348e-06,2e-06,8.640996e-07,-1.256343e-06,4.544364e-08,-5.950063e-07,-1.658783e-07,3e-06,2.620177e-07,1e-06,-1.386693e-06,6.862651e-06,-2e-06,-1e-06,9e-06,1.3e-05,-1e-05,2e-06,1.1e-05
3,0.0,23,1.0,0,10.0,1040,1,0.001153929,0.001424405,0.002888412,0.0006182741,0.005034951,0.002271755,0.003731945,0.003862946,0.01307373,0.02729795,0.04827855,-0.01835525,-0.004892313,-0.01099357,-0.0003432294,-0.008475488,0.007095404,-0.004172485,0.009785167,0.03241514,0.06584,-0.002708442,0.075463,-0.000431,-0.051652,-0.005452,-0.017909,0.002922,0.033791,0.075111,-0.032922,-0.037875,-0.024627,-0.081292,-0.102682,0.036988,-0.004385,0.027115,-0.00722,-0.114612,0.0003459076,0.0004550913,0.001348,0.002829116,-0.001855572,0.001471222,0.001124201,-9.308493e-05,0.000411,-0.0001102464,0.001032,-0.00143492,0.005135548,-0.001171,-0.000421,0.000666,0.000997,-0.003931,-0.004468,0.018866
4,0.0,23,3.0,0,11.0,1040,1,0.000108707,0.0001237601,0.0001992466,0.0007430572,0.0002407062,3.108348e-05,0.0002223759,0.0003543706,0.0004571518,0.0001027017,0.0009067908,0.0001242775,0.0005016158,0.002069098,0.0007613731,-0.0007709221,-0.001350447,1.404489e-06,-0.001592323,0.003283036,0.178239,-0.05287209,-0.061289,0.002612,-0.005922,-0.000824,-0.004359,0.000821,-0.002348,0.000506,-0.000293,0.002239,-0.000778,0.00938,0.019518,0.000206,-0.008454,0.011051,-0.006943,-0.009098,0.000903172,0.002821084,0.008449,0.004004773,-0.0006773948,0.003460808,-0.001404576,0.0004059176,0.001723,0.0007199191,0.000515,0.0006259154,0.000162568,0.001551,0.003443,0.000324,-0.00056,-0.002519,0.003965,0.003959


In [37]:
# Split df into train (70%) and test (30%) sets; likes_count is the target variable.
from sklearn.model_selection import train_test_split

target_column = "likes_count"
train, test = train_test_split(df, test_size=0.3, random_state=0)
train_X, train_y = train.drop(columns=[target_column]), train[target_column]
test_X, test_y = test.drop(columns=[target_column]), test[target_column]

In [38]:
# Train a random forest regressor on the training split.
from sklearn.ensemble import RandomForestRegressor
# The forest is built with randomness (bootstrap samples, feature sub-sampling).
# Seeding it, like the train/test split, makes the saved model and its predictions
# repeatable from one run of the notebook to the next.
model = RandomForestRegressor(random_state=0)
model.fit(train_X, train_y)

# Predict the target for the test split.
y_pred = model.predict(test_X)

In [39]:
# Save the trained random forest to disk.
file = '../app/model/trained_rfr_model.pkl'
# The context manager flushes and closes the file before any later cell reads it back.
with open(file, 'wb') as f:
    pickle.dump(model, f)

In [40]:
# Round-trip check: reload the pickled model and predict the first test row.
# pickle.load can execute arbitrary code: only load files this project wrote itself.
with open(file, 'rb') as f:
    model_rfr = pickle.load(f)
model_rfr.predict(test_X.iloc[[0]])

array([4.4])