# 数据分析

In [17]:
import pandas as pd
import numpy as np
import os 


In [18]:
# Root directory that holds the dataset and receives the exported predictions.
data_dir = '/media/data1/'

# Train / validation / test CSV locations, each resolved against data_dir.
train_data_path, eval_data_path, test_data_path = (
    os.path.join(data_dir, relative_csv)
    for relative_csv in (
        'SLTA_Projects/dataset/regression/regression_train.csv',
        'SLTA_Projects/dataset/regression/regression_val.csv',
        'SLTA_Projects/dataset/regression/regression_test.csv',
    )
)

# Destination of the CSV written at the end of the notebook.
output_csv_path = os.path.join(data_dir, 'regression_test.csv')

In [2]:
# Read the training, validation and test CSV files (in that order) into DataFrames.
train_data_df, eval_data_df, test_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, eval_data_path, test_data_path)
)

In [3]:
# Print the index range, per-column non-null counts, dtypes and memory usage of the training frame.
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       10000 non-null  float64
 1   y       10000 non-null  float64
dtypes: float64(2)
memory usage: 156.4 KB


In [4]:
# (rows, columns) of the training, validation and test frames, in that order.
tuple(frame.shape for frame in (train_data_df, eval_data_df, test_data_df))

((10000, 2), (1000, 2), (1000, 2))

In [5]:
# Display the training frame.
train_data_df

Unnamed: 0,x,y
0,-20.060964,-1743.874180
1,7.716890,108.278050
2,-15.524765,-1405.400035
3,35.701119,-4665.541541
4,-21.343111,-2032.740506
...,...,...
9995,29.648693,-3155.747546
9996,-20.338434,-1783.132457
9997,9.693818,-58.934326
9998,33.387630,-3668.760857


In [6]:
# Display every column of the training frame except the last one.
train_data_df.iloc[:, :-1]

Unnamed: 0,x
0,-20.060964
1,7.716890
2,-15.524765
3,35.701119
4,-21.343111
...,...
9995,29.648693
9996,-20.338434
9997,9.693818
9998,33.387630


In [7]:
# Display every column of the test frame except the last one.
test_data_df.iloc[:, : - 1]

Unnamed: 0,x
0,-42.189131
1,-31.017622
2,46.536153
3,-12.094295
4,23.161852
...,...
995,-25.040736
996,15.501820
997,0.254766
998,-8.634745


# 训练集、验证集、测试集划分

In [8]:
# Training, validation and test sets.
# Features are every column except the last; the target is the last column.
feature_columns = slice(None, -1)

X_train = train_data_df.iloc[:, feature_columns]
y_train = train_data_df.iloc[:, -1]

X_eval = eval_data_df.iloc[:, feature_columns]
y_eval = eval_data_df.iloc[:, -1]

# Only the features are taken from the test frame.
X_test = test_data_df.iloc[:, feature_columns]

# 使用 Python 实现线性回归模型

In [9]:
#  linear regression model
class LinearRegression:
    """Linear regression trained with full-batch gradient descent.

    The model is ``y_hat = X @ weights + bias``. Each iteration computes the
    gradient of the halved mean squared error with respect to the weights and
    the bias, and moves both parameters one ``learning_rate`` step against it.

    Parameters
    ----------
    learning_rate : float, default 0.01
        Step size applied to both the weight and the bias gradient.
    n_iterations : int, default 3000
        Number of gradient-descent steps performed by ``fit``.
    gradient_clip : float or None, default 1.0
        Every component of the weight gradient is clipped to
        ``[-gradient_clip, gradient_clip]`` before the update. ``None``
        disables clipping. The bias gradient is never clipped.
        NOTE(review): the clip presumably exists to keep training stable on
        unscaled features; confirm before changing the default.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,) or None
        Learned coefficients; ``None`` until ``fit`` has been called.
    bias : float or None
        Learned intercept; ``None`` until ``fit`` has been called.
    """

    def __init__(self, learning_rate=0.01, n_iterations=3000, gradient_clip=1.0):
        if gradient_clip is not None and gradient_clip < 0:
            raise ValueError("gradient_clip must be non-negative or None")
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.gradient_clip = gradient_clip
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Estimate ``weights`` and ``bias`` from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix (NumPy array, DataFrame, nested list, ...).
        y : array-like of shape (n_samples,) or (n_samples, 1)
            Target values.

        Raises
        ------
        ValueError
            If ``X`` is not 2-D, is empty, or ``y`` does not hold exactly one
            value per row of ``X``.
        """
        # Convert once up front so the loop works on plain float arrays
        # instead of re-converting pandas objects on every iteration.
        X = np.asarray(X, dtype=float)
        if X.ndim != 2:
            raise ValueError(
                f"X must be 2-D (n_samples, n_features); got shape {X.shape}"
            )

        y = np.asarray(y, dtype=float)
        if y.ndim == 2 and y.shape[1] == 1:
            # A column vector would broadcast against the (n_samples,)
            # prediction into an (n_samples, n_samples) residual matrix.
            y = y[:, 0]
        if y.ndim != 1:
            raise ValueError(
                f"y must be 1-D or a single column; got shape {y.shape}"
            )

        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("cannot fit on an empty dataset")
        if y.shape[0] != n_samples:
            raise ValueError(
                f"X has {n_samples} rows but y has {y.shape[0]} values"
            )

        self.weights = np.zeros(n_features)
        self.bias = 0

        for _ in range(self.n_iterations):
            residual = np.dot(X, self.weights) + self.bias - y
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)
            if self.gradient_clip is not None:
                # Only the weight gradient is bounded; the bias step is not.
                dw = np.clip(dw, -self.gradient_clip, self.gradient_clip)
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return ``X @ weights + bias`` as a NumPy array.

        Raises
        ------
        RuntimeError
            If called before ``fit``.
        """
        if self.weights is None or self.bias is None:
            raise RuntimeError("LinearRegression.predict() called before fit()")
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

# 模型训练、验证、预测

In [10]:
import numpy as np
from sklearn.metrics import mean_squared_error
# Hyper-parameters for this training run.
training_options = {'learning_rate': 0.01, 'n_iterations': 5000}

# Build the model and fit it on the training split.
model = LinearRegression(**training_options)
model.fit(X_train, y_train)


In [11]:
# Display the first rows of the validation target.
y_eval.head()

0    -331.519744
1   -3111.042517
2   -8553.950586
3   -1513.954631
4   -8449.564826
Name: y, dtype: float64

In [12]:
# Predict on the validation set.
eval_pred = model.predict(X_eval)
# Compute the mean squared error (MSE) on the validation set.
mse = mean_squared_error(y_eval, eval_pred)
# The label previously read "MES", a typo for MSE.
print("MSE:", mse)


MES: 9209846.206209619


In [13]:
# Predict on the test features and wrap the result in a DataFrame for display.
test_pred = pd.DataFrame(model.predict(X_test))
test_pred

Unnamed: 0,0
0,-3954.088323
1,-3789.783052
2,-2649.158780
3,-3511.467703
4,-2992.936946
...,...
995,-3701.877835
996,-3105.597082
997,-3329.843547
998,-3460.586269


In [14]:
# Store the predictions in column 'y' of the test frame (mutates test_data_df in place).
# NOTE(review): test_pred is a DataFrame here, so pandas presumably aligns it on the row index — confirm.
test_data_df['y'] = test_pred

In [15]:
# Display the test frame with the predicted 'y' column.
test_data_df

Unnamed: 0,x,y
0,-42.189131,-3954.088323
1,-31.017622,-3789.783052
2,46.536153,-2649.158780
3,-12.094295,-3511.467703
4,23.161852,-2992.936946
...,...,...
995,-25.040736,-3701.877835
996,15.501820,-3105.597082
997,0.254766,-3329.843547
998,-8.634745,-3460.586269


# CSV Output:

In [16]:
# Write the test frame (features plus predicted 'y') to output_csv_path.
# `index` is documented as a bool; pass False explicitly to omit the row index
# instead of relying on None being falsy.
test_data_df.to_csv(output_csv_path, index=False)