导入包

In [1]:
import java.io.File
import com.lignting.neural.*
import org.jetbrains.kotlinx.multik.api.mk
import org.jetbrains.kotlinx.multik.api.ndarray
import org.jetbrains.kotlinx.multik.ndarray.operations.toList
import org.jetbrains.kotlinx.multik.ndarray.data.get
import com.github.doyaaaaaken.kotlincsv.dsl.csvWriter
import com.github.doyaaaaaken.kotlincsv.dsl.csvReader

数据地址

In [2]:
// Relative paths of the training CSV and the test CSV that predictions are produced for.
val trainPath = "res/used_car_train_20200313.csv"
val testPath = "res/used_car_testA_20200313.csv"

使用kotlin-csv-jvm库导入csv数据

In [3]:
/**
 * Reads the CSV file at [path] into a data frame with one column per header field.
 *
 * Fields are split on a single space character, and every cell is parsed with
 * `toDoubleOrNull`, so any cell that is not a valid number is stored as `null`.
 *
 * @param path location of the CSV file to read
 * @return a data frame built from the per-column value lists
 */
fun getDataFrame(path: String): AnyFrame {
    val columns = mutableMapOf<String, MutableList<Double?>>()
    csvReader {
        delimiter = ' '
    }.readAllWithHeader(File(path)).forEach { row ->
        row.forEach { (header, cell) ->
            // getOrPut creates the column's list the first time a header is seen,
            // which removes the null check and the `!!` assertion.
            columns.getOrPut(header) { mutableListOf() }.add(cell.toDoubleOrNull())
        }
    }

    return columns.toDataFrame()
}

In [4]:
// Load both CSV files. The frames are declared `var` because later cells reassign them
// with cleaned / transformed copies.
var trainDf = getDataFrame(trainPath)
var targetDf = getDataFrame(testPath)

检查各列空值数量，并使用 -1 填充空值

In [5]:
// Show, for every column of the training frame, its name and how many nulls it contains.
trainDf.describe().select { name and nulls }

name,nulls
SaleID,0
name,0
regDate,0
model,1
brand,0
bodyType,4506
fuelType,8680
gearbox,5981
power,0
kilometer,0


In [6]:
// Replace nulls in the listed columns with the sentinel value -1.0, in both frames.
// NOTE(review): nulls here include cells that failed numeric parsing when the CSV was loaded.
trainDf = trainDf.fillNulls{ model and bodyType and fuelType and gearbox and notRepairedDamage }.with { -1.0 }
targetDf = targetDf.fillNulls{ model and bodyType and fuelType and gearbox and notRepairedDamage }.with { -1.0 }

In [7]:
// Re-check the per-column null counts of the training frame after filling.
trainDf.describe().select { name and nulls }

name,nulls
SaleID,0
name,0
regDate,0
model,0
brand,0
bodyType,0
fuelType,0
gearbox,0
power,0
kilometer,0


分析各列情况

In [8]:
// Full summary statistics (type, count, unique, nulls, top, freq, mean, std, min, median, max) per column.
trainDf.describe()

name,type,count,unique,nulls,top,freq,mean,std,min,median,max
SaleID,Double,150000,150000,0,0.0,1,74999.5,43301.414527,0.0,74999.5,149999.0
name,Double,150000,99662,0,708.0,282,68349.172873,61103.875095,0.0,51638.0,196812.0
regDate,Double,150000,3894,0,20000008.0,180,20034170.51218,53649.879255,19910001.0,20030912.0,20151212.0
model,Double,150000,249,0,0.0,11762,47.1287,49.53603,-1.0,30.0,247.0
brand,Double,150000,40,0,0.0,31480,8.052733,7.864956,0.0,6.0,39.0
bodyType,Double,150000,9,0,0.0,41420,1.708487,1.798313,-1.0,1.0,7.0
fuelType,Double,150000,8,0,0.0,91656,0.296227,0.621953,-1.0,0.0,6.0
gearbox,Double,150000,3,0,0.0,111623,0.1761,0.47417,-1.0,0.0,1.0
power,Double,150000,566,0,0.0,12829,119.316547,177.168419,0.0,110.0,19312.0
kilometer,Double,150000,13,0,15.0,96877,12.59716,3.919576,0.5,15.0,15.0


定义离散变量和连续变量

In [9]:
// Numeric feature columns: "power", "kilometer", then "v_0" through "v_14".
val num_cols = listOf("power", "kilometer") + (0..14).map { index -> "v_$index" }

// Categorical feature columns (label-encoded later).
val cate_cols = listOf(
    "model",
    "brand",
    "bodyType",
    "fuelType",
)

// Columns treated as boolean-like flags.
val boolean_cols = listOf(
    "gearbox",
    "seller",
    "notRepairedDamage",
    "offerType",
)

简易EDA

In [10]:
// For each numeric column, build a line plot of how often each distinct value occurs
// in the training frame. Only the first plot (index 0) is returned as the cell's value.
num_cols.map { col ->
    val valueCounts = trainDf[col].values.groupingBy { it }.eachCount()
    plot {
        line {
            x(valueCounts.keys, name = col)
            // The y values are occurrence counts from eachCount(), not prices,
            // so the series is named "count".
            y(valueCounts.values, name = "count")
            color = Color.BLUE
        }
    }
}[0]

In [11]:
// For each categorical column, build a line plot of how often each distinct value occurs
// in the training frame. Only the first plot (index 0) is returned as the cell's value.
cate_cols.map { col ->
    val valueCounts = trainDf[col].values.groupingBy { it }.eachCount()
    plot {
        line {
            x(valueCounts.keys, name = col)
            // The y values are occurrence counts from eachCount(), not prices,
            // so the series is named "count".
            y(valueCounts.values, name = "count")
            color = Color.BLUE
        }
    }
}[0]

In [12]:
// For each boolean-like column, build a line plot of how often each distinct value occurs
// in the training frame. Only the first plot (index 0) is returned as the cell's value.
boolean_cols.map { col ->
    val valueCounts = trainDf[col].values.groupingBy { it }.eachCount()
    plot {
        line {
            x(valueCounts.keys, name = col)
            // The y values are occurrence counts from eachCount(), not prices,
            // so the series is named "count".
            y(valueCounts.values, name = "count")
            color = Color.BLUE
        }
    }
}[0]

对离散变量进行标签特征编码

In [13]:
/**
 * Assigns consecutive numeric labels (1.0, 2.0, ...) to distinct keys, in the order
 * the keys are first seen.
 *
 * [map] holds the key-to-label assignments made so far; [max] is the highest label
 * handed out (0.0 before the first call to [get]).
 */
class LabelEncoding() {
    val map = mutableMapOf<Double, Double>()
    var max = 0.0

    /**
     * Returns the label for [key], assigning the next unused label the first time
     * the key is seen.
     *
     * The return type is non-null: a key either already has a label or receives one here.
     */
    fun get(key: Double): Double =
        map.getOrPut(key) {
            max += 1.0
            max
        }
}

In [14]:
// Label-encode every categorical column with ONE encoder shared by both frames.
// Fitting a separate encoder per frame would give the same category different codes
// whenever the two frames do not contain exactly the same set of distinct values.
cate_cols.forEach { colName ->
    val labelEncoding = LabelEncoding()

    // Register the distinct values of both frames in ascending order, so that the
    // assigned labels follow the numeric order of the raw values.
    (trainDf[colName].toList() + targetDf[colName].toList())
        .map { it as Double }
        .distinct()
        .sorted()
        .forEach { labelEncoding.get(it) }

    // Each frame is still sorted by the column before updating, so the resulting
    // row order is the same as with per-frame encoding.
    val trainCol = trainDf.getColumn(colName)
    trainDf = trainDf.sortBy { trainCol }.update { trainCol }.with { labelEncoding.get(it as Double) }
    val targetCol = targetDf.getColumn(colName)
    targetDf = targetDf.sortBy { targetCol }.update { targetCol }.with { labelEncoding.get(it as Double) }
}

对数字变量进行归一化

In [15]:
// Min-max scale every numeric column. The minimum and maximum are taken from the
// TRAINING frame and applied to both frames, so a raw value maps to the same scaled
// value at training time and at prediction time.
num_cols.forEach { colName ->
    val trainCol = trainDf.getColumn(colName)
    val min = trainCol.minBy { it as Double } as Double
    val max = trainCol.maxBy { it as Double } as Double
    val range = max - min

    // A constant column has range 0; map it to 0.0 instead of dividing by zero.
    fun rescale(value: Double): Double =
        if (range == 0.0) 0.0 else (value - min) / range

    trainDf = trainDf.update { trainCol }.with { rescale(it as Double) }
    val targetCol = targetDf.getColumn(colName)
    targetDf = targetDf.update { targetCol }.with { rescale(it as Double) }
}

获取数据并切割训练集与测试集

In [16]:
// Pair of (feature rows, label rows): each feature row lists the numeric, categorical and
// boolean-like columns in that order; each label row is a one-element list holding the price.
val data = trainDf.select(*(num_cols + cate_cols + boolean_cols).toTypedArray())
    .rows().map { row ->
        row.toMap().map { cell -> cell.value as Double }
    } to
        trainDf["price"].toList().map { listOf(it as Double) }

// Shuffle the (features, label) pairs, hold out the first 10% as the test set and
// train on the remainder.
val (trainData, testData) = data.first.zip(data.second).shuffled().let { samples ->
    // One cut index keeps the two parts disjoint. Computing the two ends independently
    // (drop 10% / dropLast 90%) rounds separately and can overlap: with 15 samples,
    // index 1 would land in both sets.
    val testSize = (samples.size * 0.1).toInt()
    samples.drop(testSize) to samples.take(testSize)
}

In [17]:
// Convert the (features, label) pairs into separate arrays for the model:
// X arrays are built from the feature lists, Y arrays from the one-element label lists.
val trainX = mk.ndarray(trainData.map { (features, _) -> features })
val trainY = mk.ndarray(trainData.map { (_, label) -> label })
val testX = mk.ndarray(testData.map { (features, _) -> features })
val testY = mk.ndarray(testData.map { (_, label) -> label })

定义模型

定义训练轮次

In [23]:
// Fully connected regression network: five hidden Dense layers (128-256-256-256-128),
// each followed by LeakyRelu and Dropout(0.2), then a single-unit Dense output passed
// through SoftPlus. Trained with Mse loss, the Adam optimizer and an exponential scheduler.
// NOTE(review): in the recorded run below the loss stays near 9.06E7 and the sample
// prediction near 69.6 across epochs; the output activation / clip and the learning-rate
// settings may deserve a second look - confirm.
val model = run {
    // Input width is one value per selected feature column.
    val featureCount = listOf(num_cols, cate_cols, boolean_cols).sumOf { it.size }
    Model(
        Dense(featureCount, 128, initialize = HeUniformInitialize()),
        LeakyRelu(),
        Dropout(0.2),
        Dense(128, 256, initialize = HeUniformInitialize()),
        LeakyRelu(),
        Dropout(0.2),
        Dense(256, 256, initialize = HeUniformInitialize()),
        LeakyRelu(),
        Dropout(0.2),
        Dense(256, 256, initialize = HeUniformInitialize()),
        LeakyRelu(),
        Dropout(0.2),
        Dense(256, 128, initialize = HeUniformInitialize()),
        LeakyRelu(),
        Dropout(0.2),
        Dense(128, 1, initialize = HeUniformInitialize()),
        SoftPlus(base = 1.01, maxClip = 10000.0),
        loss = Mse(),
        optimizer = Adam(),
        scheduler = ExponentialScheduler(1e-2, dropRate = 1 - 1e-2)
    )
}

In [24]:
// Epoch numbers to train for: 1 through 20 inclusive.
val trainEpochs = (1..20)

开始训练！！！！

In [25]:
// Histories that the training cells append to and the loss-curve cell plots:
// lossList holds training losses, evaluateList holds evaluation results on the test split.
val lossList = mutableListOf<Double>()
val evaluateList = mutableListOf<Double>()

In [26]:
// Baseline before any training ("epoch 0"): evaluate the untrained model on both splits.
val loss = model.evaluate(trainX, trainY)
// Record the baseline training loss as well. Without this entry lossList is one element
// shorter than evaluateList, and since the loss-curve cell uses the list index as the
// epoch, the train curve would be drawn one epoch earlier than the test curve.
lossList.add(loss)
val evaluate = model.evaluate(testX, testY)
println("predect=${model.predict(testX)[0]} trueValue=${testY.get(0)}")
evaluateList.add(evaluate)
println("success train in 0 epochs, with evaluate=$evaluate")
println("-------------------------------------------------------------------------------------------")

predect=[69.62692056327121] trueValue=[6850.0]
success train in 0 epochs, with evaluate=9.01264827004627E7
-------------------------------------------------------------------------------------------


In [27]:
// One pass per epoch: fit on the training split in batches of 100, evaluate on the
// test split, record both numbers and print a progress report with a sample prediction.
for (epoch in trainEpochs) {
    val epochLoss = model.fitWithBatchSize(trainX, trainY, 100)
    lossList.add(epochLoss)

    val epochEvaluate = model.evaluate(testX, testY)
    println("predect=${model.predict(testX)[0]} trueValue=${testY.get(0)}")
    evaluateList.add(epochEvaluate)

    println("success train in $epoch epochs, with loss=$epochLoss and evaluate=$epochEvaluate")
    println("-------------------------------------------------------------------------------------------")
}

predect=[69.68320607083213] trueValue=[6850.0]
success train in 1 epochs, with loss=9.059885393803729E7 and evaluate=9.00509361887814E7
-------------------------------------------------------------------------------------------
predect=[69.66022382006551] trueValue=[6850.0]
success train in 2 epochs, with loss=9.05988487648708E7 and evaluate=9.005092829976445E7
-------------------------------------------------------------------------------------------
predect=[69.65957874765537] trueValue=[6850.0]
success train in 3 epochs, with loss=9.059884547666517E7 and evaluate=9.005092614802937E7
-------------------------------------------------------------------------------------------
predect=[69.65972705396773] trueValue=[6850.0]
success train in 4 epochs, with loss=9.059884824582565E7 and evaluate=9.00509291182353E7
-------------------------------------------------------------------------------------------
predect=[69.66054290092745] trueValue=[6850.0]
success train in 5 epochs, 

org.jetbrains.kotlinx.jupyter.exceptions.ReplInterruptedException: The execution was interrupted

绘制 loss 曲线

In [28]:
// Build a long-format frame (epoch index, value, "train"/"test" tag) from both histories
// and draw one line per category. The epoch of an entry is its index in its own list.
dataFrameOf(
    "epochs" to lossList.indices.toList() + evaluateList.indices.toList(),
    "loss" to lossList + evaluateList,
    "category" to List(lossList.size) { "train" } + List(evaluateList.size) { "test" }
).plot {
    line {
        x("epochs")
        y("loss")
        color("category")
    }
}

对目标进行预测并保存预测结果

In [27]:
// Predict a price for every row of the target frame and write (SaleID, price) pairs,
// both truncated to Int, to res/result.csv.
val rows = targetDf
    .select(*(listOf("SaleID") + num_cols + cate_cols + boolean_cols).toTypedArray())
    .rows()
    .map { row ->
        val saleId = row.SaleID.toInt()
        // SaleID is selected first, so drop(1) leaves exactly the model's feature values.
        val features = row.values().drop(1).map { cell -> cell as Double }
        val predictedPrice = model.predictOne(mk.ndarray(features))[0].toInt()
        listOf(saleId, predictedPrice)
    }
csvWriter().writeAll(rows, targetFileName = "res/result.csv")