In [118]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [119]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import lightgbm as lgb
pd.set_option('display.max_columns', 1000000)
pd.set_option('display.max_rows', 10000000)

In [100]:
# データの読み込み
data_1 = pd.read_csv('/content/drive/MyDrive/qiita-love/data/2020-01.csv')

In [103]:
data_1.head(1)

Unnamed: 0,likes_count,followers_count,items_count,string
0,1,43.0,97.0,品質担保に本気で取り組んでみている話株式会社オズビジョンの @terra_yucco です。...


In [91]:
data_1.columns

Index(['likes_count', 'title', 'body', 'created_date', 'created_time', 'tags',
       'followers_count', 'organization', 'items_count'],
      dtype='object')

In [101]:
# いいね数を数値に変換する
for i in range(len(data_1)):
    if str.isdecimal(data_1['likes_count'][i]):
        data_1['likes_count'][i] = int(data_1['likes_count'][i])
    else:
        data_1['likes_count'][i] = 0

In [102]:
# 欠損値を補完する
# 文字列のデータを全て繋げる
data_1['followers_count'] = data_1['followers_count'].fillna(0)
data_1['items_count'] = data_1['items_count'].fillna(0)

data_1 = data_1.fillna('none')

data_1['string'] = data_1['title'] + data_1['body'] + data_1['created_date'] + data_1['created_time'] + data_1['tags'] + data_1['organization']
data_1 = data_1.drop(['title', 'body', 'created_date', 'created_time', 'tags', 'organization'], axis=1)

In [104]:
# TF-IDFを用いたテキストデータの数値化
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
sentence = vectorizer.fit_transform(data_1["string"])

In [105]:
# 次元削減
from sklearn.decomposition import TruncatedSVD

transformer = TruncatedSVD(n_components=8)
feature = transformer.fit_transform(sentence)

In [106]:
# 元のデータフレームに結合
name = 'tfidf_sdv_'
num = feature.shape[1]
text_df = pd.DataFrame(feature, columns=[(name + str(i)) for i in range(num)])

data_1 = pd.concat([data_1, text_df], axis=1)
data_1 = data_1.drop(['string'], axis=1)

In [107]:
# 訓練データとテストデータに分割する
train, test = train_test_split(data_1, test_size=0.3)
train_y = train['likes_count']
train_X = train.drop(['likes_count'], axis=1)
test_y = test['likes_count']
test_X = test.drop(['likes_count'], axis=1)

In [109]:
# モデルの学習
model = lgb.LGBMRegressor() # モデルのインスタンスの作成
model.fit(train_X, train_y) # モデルの学習

# テストデータの予測
y_pred = model.predict(test_X)

In [120]:
# 平均絶対誤差を計算
from sklearn.metrics import mean_absolute_error
mean_absolute_error(test_y, y_pred)

18.492931200687117

In [121]:
pd.DataFrame({'true': test_y, 'pred': y_pred})

Unnamed: 0,true,pred
2613,1,-1.592678
481,6,4.04691
3254,0,4.154653
6520,12,10.202632
3445,6,4.901983
7090,2,0.75659
9377,1,4.627177
4345,0,2.397505
9077,5,81.626227
331,1,1.029917
