In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
# Load the base table of enterprise ids and labels that the news features are joined onto later.
data = pd.read_csv('../../input/data.csv')

In [3]:
# Per-column count of missing values in the base table.
data.isna().sum()

id           0
label    10000
dtype: int64

In [4]:
# Dataset 5: news_info.csv
# Contains news / public-opinion records for the enterprises that appear in datasets 7 and 8.
# Each row is one news item for one enterprise and has 3 columns; the id column is the unique
# enterprise identifier, and columns are separated by ",".
# Data format:
# [id: unique enterprise identifier, positive_negtive: sentiment polarity of the news item, public_date: publication date]
news_info = pd.read_csv('../../input/train/news_info.csv')
print(news_info.shape)
news_info.info()

(10518, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10518 entries, 0 to 10517
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                10518 non-null  object
 1   positive_negtive  10518 non-null  object
 2   public_date       10518 non-null  object
dtypes: object(3)
memory usage: 246.6+ KB


In [5]:
# Preview the first rows of the raw news table.
news_info.head()

Unnamed: 0,id,positive_negtive,public_date
0,f000950527a6feb62669d6a175fe6fdccd1eb4f7ca8e5016,积极,2016-12-30
1,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,中立,2017-08-09
2,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,消极,2016-02-29
3,d8071a739aa75a3bcf6fb0041ee883243251d30025ab9d45,中立,2018-06-08
4,f000950527a6feb6d71de3382afa0bc5ff87bb65477f698a,积极,2015-06-29


In [6]:
# List the distinct sentiment labels present in the raw column before encoding them.
news_info['positive_negtive'].unique()

array(['积极', '中立', '消极'], dtype=object)

In [7]:
# Encode the sentiment label as a signed integer: positive -> 1, neutral -> 0, negative -> -1.
sentiment_codes = {'积极': 1, '中立': 0, '消极': -1}
# Series.map sends every value that is not a key of the mapping to NaN, so running this cell a
# second time on the already-encoded column would wipe it out. Only map while the column still
# holds the raw (non-numeric) labels, which makes the cell safe to re-run.
if not pd.api.types.is_numeric_dtype(news_info['positive_negtive']):
    news_info['positive_negtive'] = news_info['positive_negtive'].map(sentiment_codes)

In [8]:
def identify_missing(df, missing_threshold):
    """
    Report the missing rate of every column and pick the columns that exceed a threshold.

    The per-column missing rates (sorted from highest to lowest) and a one-line
    summary are printed as a side effect.

    @param df: DataFrame to inspect.
    @param missing_threshold: fraction of missing rows; a column is selected only
        when its missing rate is strictly greater than this value.
    @return: list of selected column labels, ordered by descending missing rate.
    """
    n_rows = len(df)
    null_fraction = (df.isna().sum() / n_rows).sort_values(ascending=False)
    print(null_fraction)

    exceeds = null_fraction > missing_threshold
    to_drop = null_fraction.loc[exceeds].index.tolist()
    print('{} features with greater than {} missing values.\n'.format(len(to_drop), missing_threshold))
    return to_drop

In [9]:
# Find the columns that are more than 50% missing and remove them from the news table;
# the list of removed columns is shown as the cell output.
to_drop = identify_missing(news_info, missing_threshold=0.5)
news_info = news_info.drop(columns=to_drop)
to_drop

public_date         0.0
positive_negtive    0.0
id                  0.0
dtype: float64
0 features with greater than 0.5 missing values.



[]

In [10]:
# Number of distinct enterprise ids that have at least one news row.
news_info['id'].nunique()

927

In [11]:
# Start the feature table with one row per distinct enterprise id (in order of first appearance).
df = pd.DataFrame({'id': news_info['id'].unique()})
df.shape

(927, 1)

In [12]:
# Per-enterprise summary of the encoded sentiment column: how many news rows there are and how
# many distinct sentiment values occur among them.
# Named aggregation (output_name=(column, func)) is used instead of passing an
# {output_name: func} dict to a single selected column: that dict form is a renaming pattern
# pandas rejects for ordinary SeriesGroupBy objects and has deprecated for the as_index=False
# case, whereas named aggregation yields the same `id` + named columns layout.
tmp = news_info.groupby('id', as_index=False).agg(
#     positive_negtive_mean=('positive_negtive', 'mean'),
#     positive_negtive_median=('positive_negtive', 'median'),
#     positive_negtive_sum=('positive_negtive', 'sum'),
    positive_negtive_count=('positive_negtive', 'count'),
#     positive_negtive_max=('positive_negtive', 'max'),
#     positive_negtive_min=('positive_negtive', 'min'),
    positive_negtive_nunique=('positive_negtive', 'nunique'),
)

# Left join keeps exactly one row per id already present in df.
df = df.merge(tmp, on='id', how='left')

In [13]:
# tmp['positive_negtive_max'].unique(), tmp['positive_negtive_min'].unique(), tmp['positive_negtive_median'].unique()

In [14]:
# Count news rows per (enterprise, sentiment value): add a constant column of ones and sum it in a
# pivot with one row per id and one column per encoded sentiment (-1 / 0 / 1). Combinations that
# never occur come out as NaN and are filled with 0.
news_info['tmp'] = 1
# The string alias 'sum' replaces the NumPy callable np.sum: pandas dispatches both to its own
# groupby sum, but passing the callable is deprecated in recent pandas releases.
tmp = news_info.pivot_table(values='tmp', index='id', columns='positive_negtive', aggfunc='sum').fillna(0)
# Prefix the sentiment codes so the columns read positive_negtive_-1 / _0 / _1.
tmp.columns = ['positive_negtive_{}'.format(f) for f in tmp.columns]
# Turn the id index back into a regular column so it can serve as the merge key.
tmp = tmp.reset_index()

df = df.merge(tmp, on='id', how='left')

In [15]:
# Ratio features derived from the per-sentiment counts:
#   rate_0 / rate_1 : share of neutral / positive rows among all news rows of the enterprise
#   -1_1 / -1_0     : negative count relative to the positive / neutral count
# NOTE(review): the per-sentiment counts are zero-filled, so a zero denominator gives inf
# (or NaN for 0/0) under pandas float division — confirm downstream steps tolerate that.
# df['rate_-1'] = df['positive_negtive_-1'] / df['positive_negtive_count']
df = df.assign(**{
    'rate_0': df['positive_negtive_0'].div(df['positive_negtive_count']),
    'rate_1': df['positive_negtive_1'].div(df['positive_negtive_count']),
    '-1_1': df['positive_negtive_-1'].div(df['positive_negtive_1']),
    '-1_0': df['positive_negtive_-1'].div(df['positive_negtive_0']),
})

In [16]:
# Write the per-enterprise news feature table to ../../input, without the row index.
df.to_csv('../../input/news_info.csv', index=False)

In [17]:
# Pairwise Pearson correlation between the numeric features.
# `id` is a string key: older pandas silently skipped non-numeric columns in corr(), but
# pandas >= 2.0 tries to convert them and raises, so the key is dropped explicitly.
df.drop(columns='id').corr()

Unnamed: 0,positive_negtive_count,positive_negtive_nunique,positive_negtive_-1,positive_negtive_0,positive_negtive_1,rate_0,rate_1,-1_1,-1_0
positive_negtive_count,1.0,0.324135,0.562068,0.940853,0.951994,-0.015382,0.018429,0.081147,0.011592
positive_negtive_nunique,0.324135,1.0,0.315632,0.342719,0.26096,-0.003635,-0.061744,0.387258,0.263417
positive_negtive_-1,0.562068,0.315632,1.0,0.63768,0.346353,-0.009227,-0.120743,0.368631,0.240706
positive_negtive_0,0.940853,0.342719,0.63768,1.0,0.801947,0.07079,-0.064522,0.143393,-0.001959
positive_negtive_1,0.951994,0.26096,0.346353,0.801947,1.0,-0.07027,0.093581,-0.018481,-0.019669
rate_0,-0.015382,-0.003635,-0.009227,0.07079,-0.07027,1.0,-0.822316,0.269479,-0.29954
rate_1,0.018429,-0.061744,-0.120743,-0.064522,0.093581,-0.822316,1.0,-0.478867,-0.110755
-1_1,0.081147,0.387258,0.368631,0.143393,-0.018481,0.269479,-0.478867,1.0,0.512606
-1_0,0.011592,0.263417,0.240706,-0.001959,-0.019669,-0.29954,-0.110755,0.512606,1.0


In [18]:
# Write the feature table to news_info.csv in the current working directory, without the row index.
# NOTE(review): the same frame is also written to ../../input/news_info.csv earlier in the notebook —
# confirm both copies are needed.
df.to_csv('news_info.csv', index=False)

In [19]:
# Attach the news features to the base table; enterprises without any news keep NaN features.
data = pd.merge(data, df, on='id', how='left')

In [20]:
# data['positive_negtive'].fillna(data['positive_negtive'].mean(), inplace=True)

In [21]:
# Rows with a label form the training set; rows whose label is missing form the test set.
has_label = data['label'].notna()
train, test = data[has_label], data[~has_label]
train.shape, test.shape

((14865, 11), (10000, 11))

In [22]:
# tmp = train[train['positive_negtive_mean'].notnull()]
# tmp['bins'] = pd.cut(tmp['positive_negtive_mean'], 8)
# tmp.groupby('bins')['label'].agg(['count', 'mean'])

In [23]:
# Preview the labelled rows after the feature merge.
train.head()

Unnamed: 0,id,label,positive_negtive_count,positive_negtive_nunique,positive_negtive_-1,positive_negtive_0,positive_negtive_1,rate_0,rate_1,-1_1,-1_0
0,59b38c56de3836831ff90a77d892a13523b7494f6ed09ff7,1.0,,,,,,,,,
1,da8691b210adb3f6be8064e006f220070565db287275ad38,0.0,,,,,,,,,
2,82750f1b9d122350918121f97c99bf96e11aa24ee91504a9,0.0,,,,,,,,,
3,f000950527a6feb6b2c6de6f85c1e7438ba5590be931e2ec,0.0,,,,,,,,,
4,f1c1045b13d1832927e3743e49d2917f2d98424f0849a373,0.0,,,,,,,,,


In [24]:
# train['positive_negtive_mean'].notnull().sum(), test['positive_negtive_mean'].notnull().sum()

In [25]:
# Every column except the key and the target is used as a model feature.
used_cols = [col for col in train.columns if col not in ('id', 'label')]
# Keep the target aside before narrowing both frames down to the feature columns.
y = train['label']
train, test = train[used_cols], test[used_cols]

In [26]:
# Hold out 25% of the labelled rows for validation; the fixed random_state keeps the split repeatable.
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

In [27]:
# Screen the features with `auc_select`, a project helper that comes from `from util import *`.
# NOTE(review): judging from the logged output it appears to fit a LightGBM model per feature and
# report the validation AUC, with `threshold=0.52` presumably being the cut-off that separates
# useful from useless features — confirm against util.
cols = X_train.columns
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
print('AUC useless_cols: \n', useless_cols)

positive_negtive_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.568496	valid_0's auc: 0.580506
Evaluated only: auc
**********
0.5805057044271263
positive_negtive_nunique
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[36]	training's auc: 0.568482	valid_0's auc: 0.580411
Evaluated only: auc
**********
0.5804112993443566
positive_negtive_-1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[52]	training's auc: 0.568508	valid_0's auc: 0.580411
Evaluated only: auc
**********
0.5804112993443566
positive_negtive_0
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[45]	training's auc: 0.568475	valid_0's auc: 0.580564
Evaluated only: auc
**********
0.5805635275403228
positive_negtive_1
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is

In [28]:
# Wrap both splits as LightGBM datasets; the validation set references the training set.
dtrain = lgb.Dataset(data=X_train, label=y_train)
dvalid = lgb.Dataset(data=X_valid, label=y_valid, reference=dtrain)

In [29]:
# LightGBM settings for a binary classifier evaluated by AUC.
params = dict(
    objective='binary',
    boosting='gbdt',
    metric='auc',
#     metric='None',  # when using a custom evaluation function, set metric to 'None'
    learning_rate=0.1,
    num_leaves=31,
    lambda_l1=0,
    lambda_l2=1,
    num_threads=23,
    min_data_in_leaf=20,
    first_metric_only=True,
    is_unbalance=True,
    max_depth=-1,
    seed=2020,
)

# Train while monitoring both sets; training stops once the validation score has not improved
# for 50 rounds, and evaluation results are logged every 300 rounds.
valid_model = lgb.train(
    params=params,
    train_set=dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=300,
)

Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[34]	training's auc: 0.568783	valid_1's auc: 0.580468
Evaluated only: auc


In [30]:
# Gain-based importance of every feature in the trained model, together with the feature names.
importance = valid_model.feature_importance(importance_type='gain')
feature_name = valid_model.feature_name()

# Rank features by gain, then add each feature's share of the total gain and the running total
# of those shares down the ranking.
df_importance = (
    pd.DataFrame({'feature_name': feature_name, 'importance': importance})
    .sort_values(by='importance', ascending=False)
    .assign(normalized_importance=lambda t: t['importance'] / t['importance'].sum())
    .assign(cumulative_importance=lambda t: t['normalized_importance'].cumsum())
)
df_importance

Unnamed: 0,feature_name,importance,normalized_importance,cumulative_importance
2,positive_negtive_-1,7129.39766,0.92541,0.92541
7,-1_1,194.428056,0.025237,0.950647
0,positive_negtive_count,160.384492,0.020818,0.971465
6,rate_1,66.954082,0.008691,0.980156
4,positive_negtive_1,54.558284,0.007082,0.987238
3,positive_negtive_0,47.540589,0.006171,0.993409
8,-1_0,21.579137,0.002801,0.99621
1,positive_negtive_nunique,16.319535,0.002118,0.998328
5,rate_0,12.881613,0.001672,1.0
