In [1]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
from util import *

In [2]:
# Base table: later cells left-join the news features onto it by `id`
# and split it into train/test rows on whether `label` is present.
data = pd.read_csv('../../input/data.csv')

In [3]:
# Dataset 5: news_info.csv
# News / public-opinion records for the enterprises that appear in datasets 7 and 8.
# Each row is one news item with 3 comma-separated columns; `id` is the unique enterprise identifier.
# Data format:
# [id: unique enterprise identifier, positive_negtive: sentiment polarity of the news item, public_date: publication date]
news_info = pd.read_csv('../../input/train/news_info.csv')
print(news_info.shape)
news_info.info()

(10518, 3)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10518 entries, 0 to 10517
Data columns (total 3 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                10518 non-null  object
 1   positive_negtive  10518 non-null  object
 2   public_date       10518 non-null  object
dtypes: object(3)
memory usage: 246.6+ KB


In [4]:
# Preview the first rows of the news table as loaded (sentiment still stored as text labels).
news_info.head()

Unnamed: 0,id,positive_negtive,public_date
0,f000950527a6feb62669d6a175fe6fdccd1eb4f7ca8e5016,积极,2016-12-30
1,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,中立,2017-08-09
2,f000950527a6feb6e8bd9919e2ca363359bcfa997a0f9de7,消极,2016-02-29
3,d8071a739aa75a3bcf6fb0041ee883243251d30025ab9d45,中立,2018-06-08
4,f000950527a6feb6d71de3382afa0bc5ff87bb65477f698a,积极,2015-06-29


In [5]:
# List the distinct sentiment labels; the numeric encoding in the next cell must cover each of them.
news_info['positive_negtive'].unique()

array(['积极', '中立', '消极'], dtype=object)

In [6]:
# Encode the sentiment labels as signed integers: positive -> 1, neutral -> 0, negative -> -1.
# The dtype guard makes the cell safe to re-run: applying `.map` to the already-encoded
# numeric column would match none of the string keys and turn every value into NaN.
if not pd.api.types.is_numeric_dtype(news_info['positive_negtive']):
    news_info['positive_negtive'] = news_info['positive_negtive'].map({'积极': 1, '中立': 0, '消极': -1})

In [7]:
def identify_missing(df, missing_threshold):
    """
    Find the columns whose share of missing values exceeds a threshold.

    Prints the per-column missing rate (highest first) followed by a
    one-line summary of how many columns were flagged.

    @param df: DataFrame to inspect.
    @param missing_threshold: columns whose missing rate is strictly greater than this value are flagged.
    @return: list of flagged column labels, ordered by descending missing rate.
    """
    null_counts = df.isnull().sum()
    rates = (null_counts / len(df)).sort_values(ascending=False)
    print(rates)

    over_limit = rates > missing_threshold
    flagged = rates.loc[over_limit].index.to_list()

    summary = '{} features with greater than {} missing values.\n'.format(len(flagged), missing_threshold)
    print(summary)
    return flagged

In [8]:
# Remove every column whose missing rate is above 50%; the trailing expression
# echoes the list of removed columns as the cell output.
to_drop = identify_missing(news_info, missing_threshold=0.5)
news_info.drop(columns=to_drop, inplace=True)
to_drop

public_date         0.0
positive_negtive    0.0
id                  0.0
dtype: float64
0 features with greater than 0.5 missing values.



[]

In [9]:
# Number of distinct enterprise ids that have at least one news record.
news_info['id'].nunique()

927

In [10]:
# One row per enterprise id, in order of first appearance; the per-id
# features computed in the following cells are merged onto this frame.
df = pd.DataFrame({'id': news_info['id'].unique()})
df.shape

(927, 1)

In [11]:
# Per-enterprise summary statistics of the encoded sentiment score.
# Named aggregation (output_column='function') is used instead of passing a
# {new_name: function} dict to SeriesGroupBy.agg: that "nested renamer" form was
# deprecated in pandas 0.25 and is rejected by newer releases.
tmp = (
    news_info.groupby('id')['positive_negtive']
    .agg(
        positive_negtive_mean='mean',
        positive_negtive_median='median',
        positive_negtive_sum='sum',
        positive_negtive_count='count',
        positive_negtive_max='max',
        positive_negtive_min='min',
        positive_negtive_nunique='nunique',
    )
    .reset_index()  # turn the group key back into an ordinary `id` column for the merge
)

df = df.merge(tmp, on='id', how='left')

In [12]:
# tmp['positive_negtive_max'].unique(), tmp['positive_negtive_min'].unique(), tmp['positive_negtive_median'].unique()

In [13]:
# Count of news items per enterprise for each encoded sentiment value.
# `tmp` is a constant helper column, so summing it inside each (id, sentiment) cell yields a count.
news_info['tmp'] = 1
# The aggregation is passed by name: handing the NumPy callable `np.sum` to pandas
# emits a FutureWarning in recent releases, while the string 'sum' is accepted everywhere.
# Combinations that never occur come back as NaN and are filled with 0.
tmp = news_info.pivot_table(values='tmp', index='id', columns='positive_negtive', aggfunc='sum').fillna(0)
# Column labels are the sentiment values themselves; prefix them to get feature names.
tmp.columns = ['positive_negtive_{}'.format(f) for f in tmp.columns]
tmp = tmp.reset_index()

df = df.merge(tmp, on='id', how='left')

In [14]:
# Share of neutral (0) and positive (1) items among each enterprise's news.
# The negative (-1) share stays disabled, exactly as before:
# df['rate_-1'] = df['positive_negtive_-1'] / df['positive_negtive_count']
for sentiment in (0, 1):
    df['rate_{}'.format(sentiment)] = df['positive_negtive_{}'.format(sentiment)] / df['positive_negtive_count']

# Negative-to-positive and negative-to-neutral count ratios.
# A zero denominator produces inf (or NaN for 0/0) instead of raising.
for sentiment in (1, 0):
    df['-1_{}'.format(sentiment)] = df['positive_negtive_-1'] / df['positive_negtive_{}'.format(sentiment)]

In [15]:
# Persist the complete per-enterprise news feature table (no row index column in the file).
df.to_csv('../../input/news_info.csv', index=False)

In [16]:
# Pairwise correlations between the engineered features.
# The string `id` key is excluded explicitly: since pandas 2.0 `DataFrame.corr`
# no longer discards non-numeric columns silently and raises on them instead.
df.drop(columns='id').corr()

Unnamed: 0,positive_negtive_mean,positive_negtive_median,positive_negtive_sum,positive_negtive_count,positive_negtive_max,positive_negtive_min,positive_negtive_nunique,positive_negtive_-1,positive_negtive_0,positive_negtive_1,rate_0,rate_1,-1_1,-1_0
positive_negtive_mean,1.0,0.963434,0.128465,0.016699,0.847837,0.768828,-0.095085,-0.187449,-0.046453,0.09036,-0.522647,0.914908,-0.589384,-0.431417
positive_negtive_median,0.963434,1.0,0.167701,0.058946,0.812398,0.672095,-0.024226,-0.157289,-0.012068,0.132982,-0.514904,0.886836,-0.517088,-0.37661
positive_negtive_sum,0.128465,0.167701,1.0,0.896448,0.135636,-0.095845,0.21663,0.18284,0.725576,0.985618,-0.071981,0.119825,-0.08205,-0.062742
positive_negtive_count,0.016699,0.058946,0.896448,1.0,0.121479,-0.208252,0.324135,0.562068,0.940853,0.951994,-0.015382,0.018429,0.081147,0.011592
positive_negtive_max,0.847837,0.812398,0.135636,0.121479,1.0,0.430054,0.371103,0.014825,0.103375,0.131971,-0.474793,0.79069,,-0.058375
positive_negtive_min,0.768828,0.672095,-0.095845,-0.208252,0.430054,1.0,-0.665762,-0.288108,-0.238584,-0.140976,-0.354741,0.681113,-0.394437,-0.489021
positive_negtive_nunique,-0.095085,-0.024226,0.21663,0.324135,0.371103,-0.665762,1.0,0.315632,0.342719,0.26096,-0.003635,-0.061744,0.387258,0.263417
positive_negtive_-1,-0.187449,-0.157289,0.18284,0.562068,0.014825,-0.288108,0.315632,1.0,0.63768,0.346353,-0.009227,-0.120743,0.368631,0.240706
positive_negtive_0,-0.046453,-0.012068,0.725576,0.940853,0.103375,-0.238584,0.342719,0.63768,1.0,0.801947,0.07079,-0.064522,0.143393,-0.001959
positive_negtive_1,0.09036,0.132982,0.985618,0.951994,0.131971,-0.140976,0.26096,0.346353,0.801947,1.0,-0.07027,0.093581,-0.018481,-0.019669


In [17]:
# NOTE(review): this repeats the write made two cells earlier to the same path; `df` is not
# reassigned in between (`df.corr()` returns a new frame), so it looks redundant — confirm and remove.
df.to_csv('../../input/news_info.csv', index=False)

In [18]:
# Left-join the news features onto the base table by enterprise id;
# rows whose id has no news record get NaN in the added columns.
data = pd.merge(data, df, on='id', how='left')

In [19]:
# data['positive_negtive'].fillna(data['positive_negtive'].mean(), inplace=True)

In [20]:
# Rows with a label form the training set; rows without one form the test set.
train = data[data['label'].notnull()]
test = data[data['label'].isnull()]
# NOTE(review): this expression is not the last one in the cell, so it is never displayed.
train.shape, test.shape

# Every column except the key and the target is used as a model feature.
used_cols = [i for i in train.columns if i not in ['id', 'label']]
y = train['label']
train = train[used_cols]
test = test[used_cols]

# 75% / 25% hold-out split with a fixed seed.
X_train, X_valid, y_train, y_valid = train_test_split(train, y, test_size=0.25, random_state=2020)

cols = X_train.columns
# `auc_select` is not defined in this notebook — presumably it comes from `from util import *`.
# Judging by the printed log (one LightGBM fit and one AUC per feature name) it appears to score
# each column individually against `threshold`; confirm in util.
useful_dict, useless_dict, useful_cols, useless_cols = auc_select(X_train, y_train, X_valid, y_valid, cols, threshold=0.52)
print('AUC useless_cols: \n', useless_cols)

# LightGBM datasets; the validation set reuses the training set as its reference.
dtrain = lgb.Dataset(X_train, y_train)
dvalid = lgb.Dataset(X_valid, y_valid, reference=dtrain)

params = {
    'objective': 'binary',
    'boosting': 'gbdt',
    'metric': 'auc',
#     'metric': 'None',  # set metric to 'None' when using a custom evaluation function
    'learning_rate': 0.1,
    'num_leaves': 31,
    'lambda_l1': 0,
    'lambda_l2': 1,
    'num_threads': 23,  # NOTE(review): hard-coded thread count; machine-specific
    'min_data_in_leaf': 20,
    'first_metric_only': True,
    'is_unbalance': True,
    'max_depth': -1,
    'seed': 2020
}

# Fit on the training split and monitor both splits.
# NOTE(review): `early_stopping_rounds` and `verbose_eval` are keyword arguments of older
# LightGBM releases; LightGBM 4.x expects callbacks instead — confirm the pinned version.
valid_model = lgb.train(
    params,
    dtrain,
    valid_sets=[dtrain, dvalid],
    early_stopping_rounds=50,
    verbose_eval=300 
)



positive_negtive_mean
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[42]	training's auc: 0.568555	valid_0's auc: 0.580369
Evaluated only: auc
**********
0.5803694070888776
positive_negtive_median
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[48]	training's auc: 0.568313	valid_0's auc: 0.580381
Evaluated only: auc
**********
0.5803806176924565
positive_negtive_sum
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[6]	training's auc: 0.568462	valid_0's auc: 0.580358
Evaluated only: auc
**********
0.5803576064535314
positive_negtive_count
Training until validation scores don't improve for 50 rounds
Early stopping, best iteration is:
[18]	training's auc: 0.568496	valid_0's auc: 0.580506
Evaluated only: auc
**********
0.5805057044271263
positive_negtive_max
Training until validation scores don't improve for 50 rounds
Early stopping, best iteratio

In [21]:
# Gain-based importance of every feature in the validation model, largest first,
# together with each feature's share of the total gain and the running cumulative share.
feature_name = valid_model.feature_name()
importance = valid_model.feature_importance(importance_type='gain')

df_importance = pd.DataFrame({'feature_name': feature_name, 'importance': importance})
df_importance = df_importance.sort_values(by='importance', ascending=False)
df_importance['normalized_importance'] = df_importance['importance'].div(df_importance['importance'].sum())
df_importance['cumulative_importance'] = df_importance['normalized_importance'].cumsum()
df_importance

Unnamed: 0,feature_name,importance,normalized_importance,cumulative_importance
7,positive_negtive_-1,5833.171829,0.967642,0.967642
12,-1_1,159.559006,0.026469,0.994111
5,positive_negtive_min,20.01546,0.00332,0.997431
11,rate_1,5.939357,0.000985,0.998417
3,positive_negtive_count,3.95931,0.000657,0.999073
6,positive_negtive_nunique,3.4353,0.00057,0.999643
8,positive_negtive_0,1.599142,0.000265,0.999909
13,-1_0,0.55149,9.1e-05,1.0
0,positive_negtive_mean,0.0,0.0,1.0
1,positive_negtive_median,0.0,0.0,1.0


In [22]:
# Features that contributed no gain at all to the validation model.
record_low_importance = df_importance[df_importance['importance'] == 0.0]
to_drop = list(record_low_importance['feature_name'])
print(to_drop)

# Rebind instead of dropping in place and tolerate columns that are already gone,
# so the cell can be re-run without raising KeyError.
df = df.drop(columns=to_drop, errors='ignore')
# Save the pruned table under the news feature file name. Writing it to
# 'change_info.csv' would store news features in a file named for a different
# dataset (and overwrite that file if it exists).
# NOTE(review): confirm no downstream step expects the news features at the old path.
df.to_csv('../../input/news_info.csv', index=False)

['positive_negtive_mean', 'positive_negtive_median', 'positive_negtive_sum', 'positive_negtive_max', 'positive_negtive_1', 'rate_0']
