In [1]:
# coding: utf-8

import os
import shutil

import numpy as np
import pandas as pd
from sklearn.datasets import dump_svmlight_file


## step 0: back up data

In [2]:
# Keep untouched copies of the example data under tmp/, because later cells
# overwrite regression.train / regression.test in place with other formats.
if not os.path.isdir('tmp'):
    os.mkdir('tmp')

# shutil.copy raises when a source file is missing or unreadable, so a failed
# backup stops the notebook here instead of going unnoticed.
for data_file in ('regression.train', 'regression.test'):
    shutil.copy(data_file, os.path.join('tmp', data_file))

def read_data():
    """Load the backed-up train and test files and split each into X and y.

    ``tmp/regression.train`` and ``tmp/regression.test`` are parsed as
    header-less, tab-separated files. Column 0 of each file is returned as the
    y series; the remaining columns are returned as the X frame.

    Returns:
        tuple: ``(Xtrain, ytrain, Xtest, ytest)``.
    """
    def _load(path):
        # header=None: pandas labels the columns 0, 1, 2, ...
        frame = pd.read_csv(path, sep='\t', header=None)
        return frame.drop(0, axis=1), frame[0]

    train_X, train_y = _load('tmp/regression.train')
    test_X, test_y = _load('tmp/regression.test')
    return train_X, train_y, test_X, test_y

def trainAndPredict():
    """Train with LightGBM, predict, and print the first 10 predictions.

    Runs the ``../../lightgbm`` command-line binary twice, first with
    ``train.conf`` and then with ``predict.conf`` (stdout is discarded), then
    reads ``LightGBM_predict_result.txt`` and prints the head of its single
    ``output`` column.

    Raises:
        RuntimeError: if either LightGBM invocation returns a non-zero status.
            Without this check a failed run would go on to print whatever
            result file an earlier run left behind.
    """
    for conf in ('train.conf', 'predict.conf'):
        command = '../../lightgbm config=%s > /dev/null' % conf
        status = os.system(command)
        if status != 0:
            raise RuntimeError('command failed with status %s: %s' % (status, command))

    # print first 10 lines of output file
    predictions = pd.read_csv('LightGBM_predict_result.txt', header=None, names=['output'])
    # Single-argument print(...) behaves the same under Python 2 and Python 3.
    print(predictions['output'].head(10))

## step 1 : train & predict with example data and configurations

In [3]:
# Baseline run on the example data, before any format conversion.
trainAndPredict()

0    0.700685
1    0.417371
2    0.141863
3    0.501358
4    0.229417
5    0.181165
6    0.266972
7    0.311503
8    0.781225
9    0.412591
Name: output, dtype: float64


## step 2 : transform data into svmlight format and train & predict

In [4]:
# Reload the backed-up data and overwrite the working files in svmlight format.
Xtrain, ytrain, Xtest, ytest = read_data()

for features, target, out_path in (
    (Xtrain, ytrain, 'regression.train'),
    (Xtest, ytest, 'regression.test'),
):
    dump_svmlight_file(features, target, out_path)

# Train and predict again, now reading the svmlight-formatted files.
trainAndPredict()

0    0.716717
1    0.435631
2    0.153838
3    0.489473
4    0.167634
5    0.140674
6    0.306171
7    0.368907
8    0.785962
9    0.436042
Name: output, dtype: float64


## step 3 : use a different way to transform into svmlight format

- The first line of regression.train is transformed from `1	0.869	-0.635	0.226	0.327	-0.690	0.754 ...` into `1	1:0.869	2:-0.635	3:0.226	4:0.327	5:-0.690	6:0.754 ...` (feature indices start at 1, matching the column numbers used in the code).

- Terms whose value is 0 are removed, and the result is saved as a TSV file.

In [5]:
Xtrain, ytrain, Xtest, ytest = read_data()

# Rewrite every feature value as an "index:value" token, where index is the
# column label. Zero values become empty strings so that those terms are left
# out of the row. Iterating over each frame's own columns avoids hard-coding
# the number of features.
for frame in (Xtrain, Xtest):
    for col in frame.columns:
        prefix = str(col) + ':'
        frame[col] = frame[col].map(lambda x: prefix + str(x) if x else '')

# Put the y column first and write tab-separated rows without header or index.
pd.concat([ytrain, Xtrain], axis=1).to_csv('regression.train', sep='\t', index=False, header=False)
pd.concat([ytest, Xtest], axis=1).to_csv('regression.test', sep='\t', index=False, header=False)

trainAndPredict()

0    0.716717
1    0.435631
2    0.153838
3    0.489473
4    0.167634
5    0.140674
6    0.306171
7    0.368907
8    0.785962
9    0.436042
Name: output, dtype: float64


## step 4: similar to step 3, but keep the 0-valued terms during the transformation

In [6]:
Xtrain, ytrain, Xtest, ytest = read_data()

# Rewrite every feature value, zeros included, as an "index:value" token, where
# index is the column label. Iterating over each frame's own columns avoids
# hard-coding the number of features.
for frame in (Xtrain, Xtest):
    for col in frame.columns:
        prefix = str(col) + ':'
        frame[col] = frame[col].map(lambda x: prefix + str(x))

# Put the y column first and write tab-separated rows without header or index.
pd.concat([ytrain, Xtrain], axis=1).to_csv('regression.train', sep='\t', index=False, header=False)
pd.concat([ytest, Xtest], axis=1).to_csv('regression.test', sep='\t', index=False, header=False)

trainAndPredict()

0    0.700684
1    0.417376
2    0.141863
3    0.501357
4    0.229417
5    0.181165
6    0.266972
7    0.311502
8    0.781224
9    0.412590
Name: output, dtype: float64


## final step: clean up

In [7]:
# Restore the original data files, then discard the backup directory.
# shutil.copy raises on failure, so tmp/ is removed only after both files have
# been copied back; a failed restore never deletes the only remaining copy.
for data_file in ('regression.train', 'regression.test'):
    shutil.copy(os.path.join('tmp', data_file), data_file)
shutil.rmtree('tmp')

0