In [53]:
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
from scipy import stats
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

import pandas as pd
import numpy as np
import warnings
from tqdm import tqdm
warnings.filterwarnings("ignore")

In [54]:
apps = pd.read_csv('../input_data/apps.tsv', delimiter='\t',encoding='utf-8')
user_history = pd.read_csv('../input_data/user_history.tsv', delimiter='\t',encoding='utf-8')
jobs = pd.read_csv('../input_data/jobs.tsv', delimiter='\t',encoding='utf-8', on_bad_lines="skip")
users = pd.read_csv('../input_data/users.tsv' ,delimiter='\t',encoding='utf-8')
test_users = pd.read_csv('../input_data/test_users.tsv', delimiter='\t',encoding='utf-8')

**Window = 6**

In our project, we are using only a subset of the original dataset, due to its large nature. We will only be using the data from one of the windows (here window 6)

In [55]:
user_history[user_history.WindowID==6]

Unnamed: 0,UserID,WindowID,Split,Sequence,JobTitle
1337041,13,6,Test,1,Pennsylvania Mentor
1337042,13,6,Test,2,Student Worker
1337043,13,6,Test,3,Internship in Adoption Unit
1337044,13,6,Test,4,
1337045,13,6,Test,5,Student Worker - Continuing Education
...,...,...,...,...,...
1530889,1472087,6,Train,3,GloBull Ambassador
1530890,1472087,6,Train,4,Research Assistant for Head Start Study
1530891,1472087,6,Train,5,Volunteer
1530892,1472087,6,Train,6,Customer Service Associate


In [56]:
jobs[jobs.WindowID==6].info()

<class 'pandas.core.frame.DataFrame'>
Index: 115998 entries, 861371 to 977368
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   JobID         115998 non-null  int64 
 1   WindowID      115998 non-null  int64 
 2   Title         115996 non-null  object
 3   Description   115997 non-null  object
 4   Requirements  115923 non-null  object
 5   City          115998 non-null  object
 6   State         115998 non-null  object
 7   Country       115998 non-null  object
 8   Zip5          71528 non-null   object
 9   StartDate     115998 non-null  object
 10  EndDate       115998 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.6+ MB


In [57]:
users[users.WindowID==6].info()

<class 'pandas.core.frame.DataFrame'>
Index: 43334 entries, 296639 to 339972
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                43334 non-null  int64  
 1   WindowID              43334 non-null  int64  
 2   Split                 43334 non-null  object 
 3   City                  43334 non-null  object 
 4   State                 43276 non-null  object 
 5   Country               43334 non-null  object 
 6   ZipCode               43142 non-null  object 
 7   DegreeType            32034 non-null  object 
 8   Major                 32428 non-null  object 
 9   GraduationDate        29703 non-null  object 
 10  WorkHistoryCount      43334 non-null  int64  
 11  TotalYearsExperience  41733 non-null  float64
 12  CurrentlyEmployed     40653 non-null  object 
 13  ManagedOthers         43334 non-null  object 
 14  ManagedHowMany        43334 non-null  int64  
dtypes: float64(1), int

In [58]:
user_set = users[(users.WindowID==6) & (users.Country=="US")].dropna(axis=0,subset=["Major", "TotalYearsExperience", "CurrentlyEmployed", "DegreeType"])
user_set.Split.value_counts(), user_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26006 entries, 296639 to 339971
Data columns (total 15 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   UserID                26006 non-null  int64  
 1   WindowID              26006 non-null  int64  
 2   Split                 26006 non-null  object 
 3   City                  26006 non-null  object 
 4   State                 26006 non-null  object 
 5   Country               26006 non-null  object 
 6   ZipCode               25971 non-null  object 
 7   DegreeType            26006 non-null  object 
 8   Major                 26006 non-null  object 
 9   GraduationDate        20566 non-null  object 
 10  WorkHistoryCount      26006 non-null  int64  
 11  TotalYearsExperience  26006 non-null  float64
 12  CurrentlyEmployed     26006 non-null  object 
 13  ManagedOthers         26006 non-null  object 
 14  ManagedHowMany        26006 non-null  int64  
dtypes: float64(1), int

(Split
 Train    25021
 Test       985
 Name: count, dtype: int64,
 None)

As shown in the preprocess.ipynb file, the number of US apps are very high. We will only be considering US applications.<br>
Furthermore, we willbe dropping all such users, who do not have a user_history attached to them.

In [59]:
job_set = jobs[(jobs.WindowID==6) & (jobs.Country=="US")].dropna(axis=0,subset=["Description","Requirements","Title"])
job_set.info()

<class 'pandas.core.frame.DataFrame'>
Index: 115684 entries, 861371 to 977368
Data columns (total 11 columns):
 #   Column        Non-Null Count   Dtype 
---  ------        --------------   ----- 
 0   JobID         115684 non-null  int64 
 1   WindowID      115684 non-null  int64 
 2   Title         115684 non-null  object
 3   Description   115684 non-null  object
 4   Requirements  115684 non-null  object
 5   City          115684 non-null  object
 6   State         115684 non-null  object
 7   Country       115684 non-null  object
 8   Zip5          71502 non-null   object
 9   StartDate     115684 non-null  object
 10  EndDate       115684 non-null  object
dtypes: int64(2), object(9)
memory usage: 10.6+ MB


In [60]:
user_id = user_set.UserID.unique().tolist()
job_id = job_set.JobID.unique().tolist()
# get work history of users present in the user_set
work_history = user_history[user_history.UserID.isin(user_id)]
work_history.dropna(axis=0,subset=["JobTitle"], inplace=True)

# filtering job applications data to only get applications of users and jobs present in the user_set and job_set
application_record = apps[(apps.UserID.isin(user_id))&(apps.JobID.isin(job_id))]

         UserID  WindowID  Split          ApplicationDate   JobID
1247132      13         6   Test  2012-06-19 15:36:38.583  821691
1247136      64         6  Train  2012-06-06 14:32:43.753  666073
1247137      64         6  Train  2012-06-06 14:18:55.773  281940
1247138      64         6  Train  2012-06-06 14:40:26.137  337025
1247139     101         6  Train  2012-06-06 11:47:59.313  949251


In [61]:
#further reducing the user and job set to unique values 
work_user_id = work_history.UserID.unique()
application_user_id = application_record.UserID.unique()
user_set = user_set[(user_set.UserID.isin(work_user_id))&(user_set.UserID.isin(application_user_id))]
user_id = user_set.UserID.unique()
application_record = application_record[application_record.UserID.isin(user_id)]
work_history = work_history[work_history.UserID.isin(user_id)]

In [62]:
user_set.to_csv("users.csv", index=False)
application_record.to_csv("application_record.csv", index=False)
work_history.to_csv("work_history.csv", index=False)
job_set.to_csv("jobs.csv", index=False)

**Negative-sampling**

In [63]:
#all the jobs the user has applied to grouped by user-id
final_apps = pd.DataFrame(columns = ["UserID","JobID","label"])
job_id = job_set.JobID.unique().tolist()
groups = application_record.groupby("UserID")
user_ids = []
job_ids = []
labels = []

print(groups)
i=0
for id, group in tqdm(groups):
    print(group)
    # print()
    # print()
    # print()
    # print(id)
    i=i+1
    if i==5:
        break

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x11fd38210>


  0%|          | 4/22133 [00:00<01:26, 254.59it/s]

         UserID  WindowID Split          ApplicationDate   JobID
1247132      13         6  Test  2012-06-19 15:36:38.583  821691
         UserID  WindowID  Split          ApplicationDate   JobID
1247136      64         6  Train  2012-06-06 14:32:43.753  666073
1247137      64         6  Train  2012-06-06 14:18:55.773  281940
1247138      64         6  Train  2012-06-06 14:40:26.137  337025
         UserID  WindowID  Split          ApplicationDate   JobID
1247139     101         6  Train  2012-06-06 11:47:59.313  949251
         UserID  WindowID  Split          ApplicationDate  JobID
1247140     133         6  Train  2012-06-05 11:33:45.903  17494
         UserID  WindowID  Split          ApplicationDate    JobID
1247150     182         6  Train  2012-06-11 00:21:47.237  1098447
1247151     182         6  Train  2012-06-06 18:34:33.303   428902





In [64]:
for idx, group in tqdm(groups):
    size = len(group)
    exist_job = group.JobID.unique().tolist()
    candidate_job = [i for i in job_id if i not in exist_job ]
    sample_job = np.random.randint(0,len(candidate_job),size)
    user_ids.extend([idx] * 2 * size)
    exist_job.extend([candidate_job[i] for i in sample_job])
    job_ids.extend(exist_job)
    label = [1] * size
    label.extend([0] * size)
    labels.extend(label)

final_apps.UserID = user_ids
final_apps.JobID = job_ids
final_apps.label = labels

100%|██████████| 22133/22133 [01:20<00:00, 275.39it/s]


In [65]:
final_apps.label.value_counts()

label
1    103657
0    103657
Name: count, dtype: int64

In [66]:
final_apps.to_csv("apps.csv",index=False)

**Cleaning Jobs Dataframe**

In [67]:
jobs = pd.read_csv("jobs.csv")

In [68]:
import re

removePattern = r'(<(.*?)>)|(&\w+)'
addSpacePattern = r'([;:])|(\\r)|(\\n)'
removeExtraSpaces = r'(\s\s+?)(?=\S)'

jobs['DescCleaned'] = jobs['Description'].astype(str).str.lower()
jobs['DescCleaned'] = jobs['DescCleaned'].apply(lambda x: re.sub(removePattern, "", x))
jobs['DescCleaned'] = jobs['DescCleaned'].apply(lambda x: re.sub(addSpacePattern, " ", x))
jobs['DescCleaned'] = jobs['DescCleaned'].apply(lambda x: re.sub(removeExtraSpaces, " ", x))
# Similarly for requirements
jobs['ReqCleaned'] = jobs['Requirements'].astype(str).str.lower()
jobs['ReqCleaned'] = jobs['ReqCleaned'].apply(lambda x: re.sub(removePattern, "", x))
jobs['ReqCleaned'] = jobs['ReqCleaned'].apply(lambda x: re.sub(addSpacePattern, " ", x))
jobs['ReqCleaned'] = jobs['ReqCleaned'].apply(lambda x: re.sub(removeExtraSpaces, " ", x))


In [69]:
jobs.drop(columns=['Description', 'Requirements'], inplace=True)

In [70]:
jobs.to_csv("jobs.csv", index=False)

# Build Train and Test datasets

In [71]:
apps = pd.read_csv('apps.csv')
jobs = pd.read_csv('jobs.csv')
users = pd.read_csv('users.csv')
work_history = pd.read_csv('work_history.csv')

In [72]:
jobs = jobs.fillna(" ")
jobs["word"] = jobs.Title + jobs.DescCleaned + jobs.ReqCleaned
tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=5, max_features=100, stop_words='english')
tfidf_matrix = tf.fit_transform(jobs['word'])

**TRIAL 1**

Here, I am removing all users who have more than 20 applications.

In [73]:
temp = sorted(dict(apps.UserID.value_counts()).items(), key=lambda x: x[1], reverse=True)
exclude_user_id = [i[0] for i in temp if i [1]>=20]
len(exclude_user_id)

2320

In [74]:
apps = apps[~apps.UserID.isin(exclude_user_id)]

In [75]:
user_id = apps.UserID.unique()
work_history = work_history[work_history.UserID.isin(user_id)]
users = users[users.UserID.isin(user_id)]
users.reset_index(drop=True, inplace=True)

We should drop any duplicates in work_history (I saw a few of them)

In [76]:
work_history = work_history.drop(columns=["Sequence"]).drop_duplicates()

In [77]:
work_history

Unnamed: 0,UserID,WindowID,Split,JobTitle
0,13,6,Test,Pennsylvania Mentor
1,13,6,Test,Student Worker
2,13,6,Test,Internship in Adoption Unit
3,13,6,Test,Student Worker - Continuing Education
4,13,6,Test,Sales Associate
...,...,...,...,...
97339,1471948,6,Train,Assistant (P/T)
97340,1471948,6,Train,Phone Sales
97341,1472019,6,Train,Supply Admin Clerk/ Combat Marksmanship
97342,1472066,6,Train,Manager


In [78]:
def sum_with_space(series):
    return ' '.join(series)

work_history_tf = TfidfVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0.0, max_features=50, stop_words='english')
work_history_tf_matrix = work_history_tf.fit_transform(work_history.groupby('UserID')['JobTitle'].agg(sum_with_space).values)

**Clean the users and jobs dataset**

In [79]:
users = users.drop(columns=["Country","ZipCode","Major","GraduationDate","WindowID"])

In [80]:
users.replace({"CurrentlyEmployed":{"Yes":1,"No":0}}, inplace=True)
users.replace({"ManagedOthers":{"Yes":1,"No":0}}, inplace=True)
users.replace({"DegreeType":{"None":0,"High School":1, "Vocational":2, "Associate's":3, "Bachelor's":4, "Master's":5, "PhD":6}}, 
                 inplace=True)

In [81]:
user_city_dict = dict(zip(users['UserID'], users['City']))
user_state_dict = dict(zip(users['UserID'], users['State']))

jobs_city_dict = dict(zip(jobs['JobID'], jobs['City']))
jobs_state_dict = dict(zip(jobs['JobID'], jobs['State']))

In [82]:
city = []
state = []
for index, row in tqdm(apps.iterrows(), total=len(apps)):
    city.append(1 if jobs_city_dict[row['JobID']] == user_city_dict[row['UserID']] else 0)
    state.append(1 if jobs_state_dict[row['JobID']] == user_state_dict[row['UserID']] else 0)

apps["City"] = city
apps["State"] = state

100%|██████████| 107176/107176 [00:01<00:00, 87845.28it/s]


In [83]:
apps.drop_duplicates(inplace=True)

In [84]:
train_user = users[users.Split=="Train"].UserID.values
test_user = users[users.Split=="Test"].UserID.values
train_data = apps[apps.UserID.isin(train_user)]
test_data = apps[apps.UserID.isin(test_user)]

In [85]:
train_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 104700 entries, 2 to 207313
Data columns (total 5 columns):
 #   Column  Non-Null Count   Dtype
---  ------  --------------   -----
 0   UserID  104700 non-null  int64
 1   JobID   104700 non-null  int64
 2   label   104700 non-null  int64
 3   City    104700 non-null  int64
 4   State   104700 non-null  int64
dtypes: int64(5)
memory usage: 4.8 MB


In [86]:
groups = train_data.groupby("UserID")
X_train = np.zeros((1,158))
Y_train = []
for u_id, group in tqdm(groups):
    # print(u_id)
    user = users[users.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]

    # print(u_idx)
    
    user_feature = np.concatenate((user.values, work_history_tf_matrix[u_idx,:].toarray()),axis=1)
    job_id_list = group.JobID.values   #all the jobs the user has applied / not applied to 
    temp_jobs = jobs[jobs.JobID.isin(job_id_list)]  # from all jobs, get the jobs from the job_id_list
    j_idx = temp_jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        feature = np.concatenate((feature, ))
        f.append(feature)
    # print(len(group[["City","State"]].values), ' ', len(temp_jobs))
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_train = np.concatenate((X_train, feature), axis=0)
    Y_train.extend(group.label.values.tolist())
X_train = X_train[1:]

100%|██████████| 19473/19473 [03:13<00:00, 100.62it/s]


In [87]:
X_train.shape, len(Y_train)

((104700, 158), 104700)

In [88]:
groups = test_data.groupby("UserID")
X_test = np.zeros((1,158))
Y_test = []
for u_id, group in tqdm(groups):
    user = users[users.UserID==u_id][["DegreeType", "WorkHistoryCount", "TotalYearsExperience", "CurrentlyEmployed", 
                                            "ManagedOthers", "ManagedHowMany"]]
    u_idx = user.index.values[0]

    user_feature = np.concatenate((user.values, work_history_tf_matrix[u_idx,:].toarray()),axis=1) # UI

    job_id_list = group.JobID.values
    temp_jobs = jobs[jobs.JobID.isin(job_id_list)] # job table
    
    j_idx = temp_jobs.index.values
    f = []
    for i in j_idx:
        feature = np.concatenate((user_feature, tfidf_matrix[i,:].toarray()), axis=1).reshape(156,).tolist()
        f.append(feature)
    feature = np.concatenate((group[["City","State"]].values, np.array(f)),axis=1)
    X_test = np.concatenate((X_test, feature), axis=0)
    Y_test.extend(group.label.values.tolist())
X_test = X_test[1:]

100%|██████████| 340/340 [00:00<00:00, 486.26it/s]


In [119]:
X_test

array([[0.        , 1.        , 4.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 4.        , ..., 0.14426976, 0.        ,
        0.        ],
       [0.        , 1.        , 1.        , ..., 0.16595852, 0.        ,
        0.12169042],
       ...,
       [0.        , 0.        , 4.        , ..., 0.09134875, 0.        ,
        0.05023666],
       [0.        , 0.        , 4.        , ..., 0.48112811, 0.07308761,
        0.        ],
       [0.        , 0.        , 4.        , ..., 0.17415049, 0.25132248,
        0.19154588]])

### Models

In [89]:
def show_result(y_true, y_prediction):
    report = classification_report(y_true,y_prediction,digits=4)
    report = report.splitlines()
    columns = ['class'] + report[0].split()
    col_1, col_2, col_3, col_4, col_5 = [], [], [], [], []
    for row in report[1:]:
        if len(row.split()) != 0:
            row = row.split()
            if len(row) < 5:
                col_1.append(row[0])
                col_2.append('')
                col_3.append('')
                col_4.append(row[1])
                col_5.append(row[2])
            elif len(row) > 5:
                col_1.append(row[0] + ' ' + row[1])
                col_2.append(row[2])
                col_3.append(row[3])
                col_4.append(row[4])
                col_5.append(row[5])
            else:
                col_1.append(row[0])
                col_2.append(row[1])
                col_3.append(row[2])
                col_4.append(row[3])
                col_5.append(row[4])
    col_1.append("overall")
    col_2.append(precision_score(y_true, y_prediction))
    col_3.append(recall_score(y_true, y_prediction))
    col_4.append(f1_score(y_true, y_prediction))
    col_5.append(roc_auc_score(y_true, y_prediction))
    result = pd.DataFrame()
    result[columns[0]] = col_1
    result[columns[1]] = col_2
    result[columns[2]] = col_3
    result[columns[3]] = col_4
    result[columns[4]] = col_5
    print("——————Test——————")
    print(result)

In [90]:
lr = LinearRegression()
lr.fit(X_train, Y_train)
y_pred = lr.predict(X_test)
y_pred = [0 if i<0.5 else 1 for i in y_pred]
show_result(Y_test, y_pred)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8614    0.9588    0.9075      1238
1             1    0.9536    0.8457    0.8964      1238
2      accuracy                        0.9023      2476
3     macro avg    0.9075    0.9023    0.9019      2476
4  weighted avg    0.9075    0.9023    0.9019      2476
5       overall  0.953552  0.845719  0.896404  0.902262


In [91]:
import numpy as np

# Convert X_train and Y_train to numpy arrays if they are not already
X_train = np.array(X_train)
Y_train = np.array(Y_train)
Y_train = Y_train.reshape(-1,1)

# Similarly, convert X_test and Y_test if needed
X_test = np.array(X_test)
Y_test = np.array(Y_test)

In [117]:
X_test

array([[0.        , 1.        , 4.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 4.        , ..., 0.14426976, 0.        ,
        0.        ],
       [0.        , 1.        , 1.        , ..., 0.16595852, 0.        ,
        0.12169042],
       ...,
       [0.        , 0.        , 4.        , ..., 0.09134875, 0.        ,
        0.05023666],
       [0.        , 0.        , 4.        , ..., 0.48112811, 0.07308761,
        0.        ],
       [0.        , 0.        , 4.        , ..., 0.17415049, 0.25132248,
        0.19154588]])

In [103]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, Dropout
from tqdm import tqdm

# Define the model architecture
model = Sequential()
model.add(Dense(256, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1, activation='sigmoid'))  # Binary classification

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, Y_train, epochs=10, batch_size=32, validation_data=(X_test, Y_test))

# Evaluate the model on test data
scores = model.evaluate(X_test, Y_test)
print(f"Test loss: {scores[0]}")
print(f"Test accuracy: {scores[1]}")

# Make predictions on the test data
y_pred_nueral = model.predict(X_test)
# y_pred_nueral = (y_pred_nueral > 0.5).astype(int)



Epoch 1/10
[1m3272/3272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 893us/step - accuracy: 0.6839 - loss: 0.6338 - val_accuracy: 0.8845 - val_loss: 0.3174
Epoch 2/10
[1m3272/3272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 832us/step - accuracy: 0.9046 - loss: 0.3336 - val_accuracy: 0.9023 - val_loss: 0.3067
Epoch 3/10
[1m3272/3272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 885us/step - accuracy: 0.9079 - loss: 0.3153 - val_accuracy: 0.9023 - val_loss: 0.3026
Epoch 4/10
[1m3272/3272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 838us/step - accuracy: 0.9067 - loss: 0.3153 - val_accuracy: 0.9023 - val_loss: 0.3046
Epoch 5/10
[1m3272/3272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 850us/step - accuracy: 0.9086 - loss: 0.3087 - val_accuracy: 0.9015 - val_loss: 0.3010
Epoch 6/10
[1m3272/3272[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 832us/step - accuracy: 0.9095 - loss: 0.3065 - val_accuracy: 0.8934 - val_loss: 0.3139
Epoc

In [104]:
show_result(Y_test, y_pred_nueral)

——————Test——————
          class precision    recall  f1-score   support
0             0    0.8641    0.9556    0.9076      1238
1             1    0.9503    0.8498    0.8972      1238
2      accuracy                        0.9027      2476
3     macro avg    0.9072    0.9027    0.9024      2476
4  weighted avg    0.9072    0.9027    0.9024      2476
5       overall  0.950316  0.849758  0.897228  0.902666


In [135]:
from keras.models import save_model, load_model

# Save the model
model.save('keras_model.h5')



In [134]:
y_pred_nueral

array([[0.92766005],
       [0.13133308],
       [0.92488813],
       ...,
       [0.17713252],
       [0.17160837],
       [0.1896137 ]], dtype=float32)