# 回归数据EDA

In [1]:
import pandas as pd
import numpy as np
import os 

# Root directory that every input and output path below is resolved against.
data_dir = './'

# Input CSVs for the training, validation and test splits, in that order.
train_data_path, eval_data_path, test_data_path = (
    os.path.join(data_dir, relative_csv)
    for relative_csv in (
        'SLTA_Projects/dataset/regression/regression_train.csv',
        'SLTA_Projects/dataset/regression/regression_val.csv',
        'SLTA_Projects/dataset/regression/regression_test.csv',
    )
)

# Destination of the test frame once its `y` column holds the predictions.
output_csv_path = os.path.join(data_dir, 'regression_test.csv')

In [2]:
# Read the three splits into DataFrames (training, validation, test).
train_data_df, eval_data_df, test_data_df = (
    pd.read_csv(csv_path)
    for csv_path in (train_data_path, eval_data_path, test_data_path)
)

In [3]:
# Summarise the training split: row count, column dtypes and non-null counts.
train_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       10000 non-null  float64
 1   y       10000 non-null  float64
dtypes: float64(2)
memory usage: 156.4 KB


In [5]:
# Summarise the validation split: row count, column dtypes and non-null counts.
eval_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       1000 non-null   float64
 1   y       1000 non-null   float64
dtypes: float64(2)
memory usage: 15.8 KB


In [6]:
# Summarise the test split. In the recorded run its `y` column has no non-null
# values; it is filled with model predictions later in the notebook.
test_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   x       1000 non-null   float64
 1   y       0 non-null      float64
dtypes: float64(2)
memory usage: 15.8 KB


In [7]:
# Compare the (rows, columns) shapes of the training, validation and test splits.
train_data_df.shape, eval_data_df.shape, test_data_df.shape

((10000, 2), (1000, 2), (1000, 2))

In [8]:
# Preview the first five rows of the training split.
train_data_df.head(5)

Unnamed: 0,x,y
0,-20.060964,-1743.87418
1,7.71689,108.27805
2,-15.524765,-1405.400035
3,35.701119,-4665.541541
4,-21.343111,-2032.740506


In [9]:
# Preview the first five rows of the validation split.
eval_data_df.head(5)

Unnamed: 0,x,y
0,8.739428,-331.519744
1,29.16881,-3111.042517
2,-44.642598,-8553.950586
3,-19.6347,-1513.954631
4,-44.507558,-8449.564826


In [10]:
# Preview the first five rows of the test split.
test_data_df.head(5)

Unnamed: 0,x,y
0,-42.189131,
1,-31.017622,
2,46.536153,
3,-12.094295,
4,23.161852,


In [11]:
# Preview the feature part of the training split: every column except the last.
train_data_df.iloc[:, :-1]

Unnamed: 0,x
0,-20.060964
1,7.716890
2,-15.524765
3,35.701119
4,-21.343111
...,...
9995,29.648693
9996,-20.338434
9997,9.693818
9998,33.387630


In [13]:
# Preview the feature part of the test split: every column except the last.
test_data_df.iloc[:, :-1]

Unnamed: 0,x
0,-42.189131
1,-31.017622
2,46.536153
3,-12.094295
4,23.161852
...,...
995,-25.040736
996,15.501820
997,0.254766
998,-8.634745


# 训练集、验证集、测试集的特征与标签划分

In [4]:
# Separate features from target by column position: every column except the
# last is a feature, the last column is the target.
X_train = train_data_df.iloc[:, :-1]
y_train = train_data_df.iloc[:, -1]

X_eval = eval_data_df.iloc[:, :-1]
y_eval = eval_data_df.iloc[:, -1]

# Only the features of the test split are extracted; no target is taken from it.
X_test = test_data_df.iloc[:, :-1]

# 使用numpy 实现线性回归模型

In [5]:
# Define the linear regression model
class LinearRegression:
    """Linear regression fitted with full-batch gradient descent.

    The model is ``y_hat = X @ weights + bias``. ``fit`` starts both
    parameters at zero and applies ``n_iterations`` update steps, each computed
    from the whole data set.

    Parameters
    ----------
    learning_rate : float, default 0.001
        Step size multiplied into every gradient before it is subtracted.
    n_iterations : int, default 1000
        Number of update steps performed by ``fit``.
    clip_value : float or None, default 1.0
        Each component of the weight gradient is clipped to
        ``[-clip_value, clip_value]`` before the update. The bias gradient is
        never clipped. ``None`` disables clipping. The default reproduces the
        previously hard-coded bound of 1.

    Attributes
    ----------
    weights : numpy.ndarray of shape (n_features,), or None before ``fit``
    bias : float, or None before ``fit``
    """

    def __init__(self, learning_rate=0.001, n_iterations=1000, clip_value=1.0):
        if clip_value is not None and clip_value <= 0:
            raise ValueError("clip_value must be positive or None")
        self.learning_rate = learning_rate
        self.n_iterations = n_iterations
        self.clip_value = clip_value
        self.weights = None
        self.bias = None

    def fit(self, X, y):
        """Estimate ``weights`` and ``bias`` from training data.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix (e.g. a NumPy array or a pandas DataFrame).
        y : array-like with n_samples elements
            Targets. A column vector is flattened so that it cannot broadcast
            against the predictions into an (n_samples, n_samples) matrix.

        Raises
        ------
        ValueError
            If ``X`` is not two-dimensional, contains no rows, or its row count
            differs from the number of targets.
        """
        # Convert once up front so the loop works on plain float arrays instead
        # of re-converting the inputs on every iteration.
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).ravel()

        if X.ndim != 2:
            raise ValueError("X must be 2-dimensional (n_samples, n_features)")
        n_samples, n_features = X.shape
        if n_samples == 0:
            raise ValueError("X must contain at least one sample")
        if y.shape[0] != n_samples:
            raise ValueError("X and y must contain the same number of samples")

        self.weights = np.zeros(n_features)
        self.bias = 0.0

        for _ in range(self.n_iterations):
            y_predicted = np.dot(X, self.weights) + self.bias
            residual = y_predicted - y

            # Gradients of the squared error averaged over the samples; the
            # constant factor 2 of the derivative is not applied.
            dw = (1 / n_samples) * np.dot(X.T, residual)
            db = (1 / n_samples) * np.sum(residual)

            # Optional clipping, applied to the weight gradient only.
            if self.clip_value is not None:
                dw = np.clip(dw, -self.clip_value, self.clip_value)

            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db

    def predict(self, X):
        """Return ``X @ weights + bias`` as a NumPy array with one value per row of ``X``."""
        return np.dot(np.asarray(X, dtype=float), self.weights) + self.bias

# 模型训练、验证、预测

In [16]:
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import mean_squared_error


# Build the NumPy gradient-descent model defined earlier in this notebook and
# fit it on the training split.
model = LinearRegression(n_iterations=3000, learning_rate=0.01)
model.fit(X=X_train, y=y_train)


In [17]:
# Inspect the validation targets that the predictions will be scored against.
y_eval

0      -331.519744
1     -3111.042517
2     -8553.950586
3     -1513.954631
4     -8449.564826
          ...     
995   -8583.044698
996   -3666.074078
997   -5152.270035
998    -163.364393
999   -4498.016118
Name: y, Length: 1000, dtype: float64

In [19]:
# Predict on the validation split.
eval_pred = model.predict(X_eval)

# Score the validation predictions with mean squared error.
mse = mean_squared_error(y_true=y_eval, y_pred=eval_pred)
print("Mean Squared Error:", mse)


Mean Squared Error: 9209846.206213104


In [37]:
# Predict on the test features and wrap the result in a one-column DataFrame.
test_pred = pd.DataFrame(model.predict(X_test))
test_pred

Unnamed: 0,0
0,-3754.828551
1,-3643.113463
2,-2867.575716
3,-3453.880195
4,-3101.318720
...,...
995,-3583.344602
996,-3177.919044
997,-3330.389584
998,-3419.284690


In [38]:
# Write the predictions into the `y` column of the test frame. This mutates
# test_data_df in place; `test_pred` is the one-column DataFrame built in the
# previous cell.
test_data_df['y'] = test_pred

In [39]:
# Inspect the test frame now that `y` holds the model predictions.
test_data_df

Unnamed: 0,x,y
0,-42.189131,-3754.828551
1,-31.017622,-3643.113463
2,46.536153,-2867.575716
3,-12.094295,-3453.880195
4,23.161852,-3101.318720
...,...,...
995,-25.040736,-3583.344602
996,15.501820,-3177.919044
997,0.254766,-3330.389584
998,-8.634745,-3419.284690


# 输出结果文件

In [40]:
# Save the test frame (features plus predicted `y`) without the row index.
# `index` is a boolean flag, so pass False explicitly rather than relying on
# None being falsy.
test_data_df.to_csv(output_csv_path, index=False)