In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
data = pd.read_excel("data10.xlsx")
data.head()

Unnamed: 0,FINANCIAL_REPORTING_MAIN_CODE,SL_CODE,dbt_value,crd_value
0,9706,111110413,3452797000.0,0.0
1,9706,111110414,19899680000.0,0.0
2,9706,111110415,679458600.0,0.0
3,9709,113121307,1652008000000.0,0.0
4,9709,113121308,2592063000000.0,0.0


In [3]:
data['SL_CODE'] = data['SL_CODE'].apply(str)

In [4]:
input_data = data["SL_CODE"]
output_data = data[["dbt_value", "crd_value"]]

In [5]:
cv = CountVectorizer()
input_data = cv.fit_transform(input_data)

In [6]:
def data_preprocessing_pipeline(data):
    #Identify numeric and categorical features
    numeric_features = data.select_dtypes(include=['float', 'int']).columns
#     categorical_features = data.select_dtypes(include=['object']).columns

    #Handle missing values in numeric features
    data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())

    #Detect and handle outliers in numeric features using IQR
    for feature in numeric_features:
        Q1 = data[feature].quantile(0.25)
        Q3 = data[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - (1.5 * IQR)
        upper_bound = Q3 + (1.5 * IQR)
        data[feature] = np.where((data[feature] < lower_bound) | (data[feature] > upper_bound),
                                 data[feature].mean(), data[feature])

    #Normalize numeric features
#     scaler = StandardScaler()
#     scaled_data = scaler.fit_transform(data[numeric_features])
#     data[numeric_features] = scaler.transform(data[numeric_features])

    #Handle missing values in categorical features
#     data[categorical_features] = data[categorical_features].fillna(data[categorical_features].mode().iloc[0])
    
    return data

In [7]:
output_data.head()

Unnamed: 0,dbt_value,crd_value
0,3452797000.0,0.0
1,19899680000.0,0.0
2,679458600.0,0.0
3,1652008000000.0,0.0
4,2592063000000.0,0.0


In [8]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11984 entries, 0 to 11983
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   dbt_value  11984 non-null  float64
 1   crd_value  11984 non-null  float64
dtypes: float64(2)
memory usage: 187.4 KB


In [9]:
output_data.describe()

Unnamed: 0,dbt_value,crd_value
count,11984.0,11984.0
mean,582667300000.0,270180800000.0
std,5778788000000.0,5816253000000.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,3929139000.0,435555500.0
max,159647600000000.0,284407700000000.0


In [10]:
output_data = data_preprocessing_pipeline(output_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[numeric_features] = data[numeric_features].fillna(data[numeric_features].mean())
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature] = np.where((data[feature] < lower_bound) | (data[feature] > upper_bound),
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data[feature] = np.where((data[

In [11]:
output_data.head()

Unnamed: 0,dbt_value,crd_value
0,3452797000.0,0.0
1,582667300000.0,0.0
2,679458600.0,0.0
3,582667300000.0,0.0
4,582667300000.0,0.0


In [12]:
output_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11984 entries, 0 to 11983
Data columns (total 2 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   dbt_value  11984 non-null  float64
 1   crd_value  11984 non-null  float64
dtypes: float64(2)
memory usage: 187.4 KB


In [13]:
output_data.describe()

Unnamed: 0,dbt_value,crd_value
count,11984.0,11984.0
mean,120872200000.0,57149950000.0
std,235730800000.0,110290900000.0
min,0.0,0.0
25%,0.0,0.0
50%,0.0,0.0
75%,3929139000.0,435555500.0
max,582667300000.0,270180800000.0


In [14]:
type(input_data), type(output_data)

(scipy.sparse._csr.csr_matrix, pandas.core.frame.DataFrame)

In [15]:
X_train, X_test, y_train, y_test = train_test_split(input_data, output_data, test_size=0.2, random_state=42)

In [16]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((9587, 2231), (2397, 2231), (9587, 2), (2397, 2))

In [22]:
X_train = torch.tensor(X_train.toarray()).type(torch.float32)
y_train = torch.tensor(y_train.values).type(torch.float32)
X_test = torch.tensor(X_test.toarray()).type(torch.float32)
y_test = torch.tensor(y_test.values).type(torch.float32)
X_train.size(), y_train.size()

(torch.Size([9587, 2231]), torch.Size([9587, 2]))

In [45]:
X_train = X_train.squeeze()

X_train.size()

torch.Size([9587, 2231])

In [39]:
class DBTCRD(nn.Module):
    def __init__(self):
        super().__init__() 
        
        self.linear1 = nn.Linear(1, 9)
        self.linear2 = nn.Linear(9, 2)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        output = self.linear2(self.relu(self.linear1(x)))
        return output

In [40]:
model = DBTCRD()

In [41]:
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.01)

In [42]:
X_train.size(), y_train.size()

(torch.Size([9587, 1, 2231]), torch.Size([9587, 2]))

In [43]:
epochs = 500

for epoch in range(epochs):
    model.train()
    X_preds = model(X_train)
    loss = loss_fn(X_preds, y_train)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    
#     model2.eval()
#     with torch.no_grad():
#         y_preds = model2(X_test)
#         loss_test = loss_fn(y_preds, y_test)
    if epoch % 100 == 0:
        print(f"Epoch: {epoch} | Loss: {loss}")

RuntimeError: mat1 and mat2 shapes cannot be multiplied (9587x2231 and 1x9)

In [179]:
torch.manual_seed(42)

epochs = 3000

# Put data on the target device (device agnostic code for data) 
# X_train = X_train.to(device)
# y_train = y_train.to(device)
# X_test = X_test.to(device)
# y_test = y_test.to(device)

for epoch in range(epochs):
    model_1.train()

    # 1. Forward pass
    y_pred = model_1(X_train)

    # 2. Calculate the loss
    loss = loss_fn(y_pred, y_train)

    # 3. Optimizer zero grad
    optimizer.zero_grad()

    # 4. Perform backpropagation
    loss.backward()

    # 5. Optimizer step
    optimizer.step()

    ### Testing
#     model_1.eval()
#     with torch.inference_mode():
#     test_pred = model_1(X_test)

#     test_loss = loss_fn(test_pred, y_test)

    # Print out what's happening
    if epoch % 100 == 0: 
#     print(f"Epoch: {epoch} | Loss: {loss} | Test loss: {test_loss}")
        print(f"Epoch: {epoch} | Loss: {loss}")

Epoch: 0 | Loss: 88119746560.0
Epoch: 100 | Loss: 88119746560.0
Epoch: 200 | Loss: 88119746560.0
Epoch: 300 | Loss: 88119746560.0
Epoch: 400 | Loss: 88119746560.0
Epoch: 500 | Loss: 88119746560.0
Epoch: 600 | Loss: 88119746560.0
Epoch: 700 | Loss: 88119746560.0
Epoch: 800 | Loss: 88119746560.0
Epoch: 900 | Loss: 88119746560.0
Epoch: 1000 | Loss: 88119746560.0
Epoch: 1100 | Loss: 88119746560.0
Epoch: 1200 | Loss: 88119746560.0
Epoch: 1300 | Loss: 88119746560.0
Epoch: 1400 | Loss: 88119746560.0
Epoch: 1500 | Loss: 88119746560.0
Epoch: 1600 | Loss: 88119746560.0
Epoch: 1700 | Loss: 88119746560.0
Epoch: 1800 | Loss: 88119746560.0
Epoch: 1900 | Loss: 88119746560.0
Epoch: 2000 | Loss: 88119746560.0
Epoch: 2100 | Loss: 88119746560.0
Epoch: 2200 | Loss: 88119746560.0
Epoch: 2300 | Loss: 88119746560.0
Epoch: 2400 | Loss: 88119746560.0
Epoch: 2500 | Loss: 88119746560.0
Epoch: 2600 | Loss: 88119746560.0
Epoch: 2700 | Loss: 88119746560.0
Epoch: 2800 | Loss: 88119746560.0
Epoch: 2900 | Loss: 881197

In [222]:
data_jadid = data.groupby('SL_CODE',as_index=False).sum()
data_jadid.head()

Unnamed: 0,SL_CODE,FINANCIAL_REPORTING_MAIN_CODE,dbt_value,crd_value
0,101010101,77451,18756470000000.0,0.0
1,101010201,67848,3051579000000.0,0.0
2,101020101,67848,0.0,0.0
3,101030101,67848,820965700000.0,0.0
4,101040101,67848,0.0,0.0


In [43]:
X_train.shape

torch.Size([8388])

In [52]:
len(X_train.unique())

2187

In [59]:
inputs = X_train[i:i+batch_size]
len(inputs.unique())

64

In [73]:
from scipy.sparse import coo_matrix
coo = coo_matrix(([3,4,5], ([0,1,1], [2,0,2])), shape=(2,3))

values = coo.data
indices = np.vstack((coo.row, coo.col))

i = torch.LongTensor(indices)
v = torch.FloatTensor(values)
shape = coo.shape

torch.sparse.FloatTensor(i, v, torch.Size(shape)).to_dense()

tensor([[0., 0., 3.],
        [4., 0., 5.]])

In [74]:
coo.row

array([0, 1, 1], dtype=int32)