In [1224]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

In [1225]:
df_tr = pd.read_csv('train.csv')
df_ts = pd.read_csv('test.csv')
df = pd.concat([df_tr, df_ts])
df

Unnamed: 0,Tectonic regime,Onshore/Offshore,Hydrocarbon type,Reservoir status,Structural setting,Depth,Period,Lithology,Gross,Netpay,Porosity,Permeability
0,STRIKE-SLIP/TRANSPRESSION/BASEMENT-I,OFFSHORE,OIL,DEVELOPING,INVERSION/WRENCH,3520,NEOGENE,SANDSTONE,2460.0,220.0,20.0,45.0
1,GRAVITY/EXTENSION/EVAPORITE,OFFSHORE,OIL,MATURE PRODUCTION,SALT/PASSIVE MARGIN,9967,CRETACEOUS,LIMESTONE,427.0,160.0,19.0,175.0
2,GRAVITY/EXTENSION/EVAPORITE,ONSHORE,OIL,MATURE PRODUCTION,PASSIVE MARGIN,8700,CRETACEOUS,LIMESTONE,95.0,15.0,12.0,20.0
3,COMPRESSION,ONSHORE,OIL,DECLINING PRODUCTION,THRUST,5084,CRETACEOUS,SANDSTONE,328.0,300.0,13.0,600.0
4,INVERSION/COMPRESSION/EXTENSION,ONSHORE,OIL,DECLINING PRODUCTION,INVERSION/RIFT,1030,CRETACEOUS,SANDSTONE,260.0,33.0,24.0,182.0
...,...,...,...,...,...,...,...,...,...,...,...,...
128,EXTENSION/EROSION,,OIL,DECLINING PRODUCTION,RIFT,5520,TRIASSIC-JURASSIC,SANDSTONE,630.0,394.0,26.0,1000.0
129,COMPRESSION/EROSION,,BITUMEN,CONTINUING DEVELOPMENT,FORELAND,1500,CRETACEOUS,SANDSTONE,100.0,82.0,28.0,440.0
130,COMPRESSION/STRIKE-SLIP/TRANSPRESSION/BASEMENT-I,,OIL,NEARLY DEPLETED,WRENCH/FOREARC,11100,NEOGENE,THINLY-BEDDED SANDSTONE,200.0,150.0,20.0,75.0
131,INVERSION/COMPRESSION/EXTENSION,,OIL,SECOND PLATEAU PRODUTION,RIFT/INVERSION,3939,PALEOGENE,SANDSTONE,410.0,20.0,28.0,1000.0


In [1226]:
# подготовка данных
aX = df.drop('Onshore/Offshore', axis=1)
ay = df['Onshore/Offshore']
X_transform = pd.get_dummies(aX, drop_first=True)
X_transform
df = pd.concat([X_transform, ay], axis=1)
df_mod_tr = df.dropna()
df_mod_ts = df[df['Onshore/Offshore'].isnull()]
df_mod_ts = df_mod_ts.drop('Onshore/Offshore', axis=1)

In [1227]:
df_mod_tr['Onshore/Offshore'].value_counts()

ONSHORE             211
OFFSHORE             93
ONSHORE-OFFSHORE      5
Name: Onshore/Offshore, dtype: int64

Данные плохо сбалансированны, accuracy не точна.

In [1228]:
X = df_mod_tr.drop('Onshore/Offshore', axis=1)
y = df_mod_tr['Onshore/Offshore']

In [1229]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 61)

In [1230]:
# KNN модель
knc = KNeighborsClassifier(n_neighbors=7)
knc.fit(X_train, y_train)
y_pred = knc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    OFFSHORE       0.83      0.50      0.62        10
     ONSHORE       0.80      0.95      0.87        21

    accuracy                           0.81        31
   macro avg       0.82      0.73      0.75        31
weighted avg       0.81      0.81      0.79        31



  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [1231]:
# SVC модель + стандартизация данных 
scaler = StandardScaler()
X_scale = scaler.fit_transform(X)
X_train_s, X_test_s, y_train_s, y_test_s = train_test_split(X_scale, y, test_size=0.1, random_state=61)

svc_model = SVC(kernel='rbf', degree=5)
svc_model.fit(X_train_s, y_train_s)
y_pred = svc_model.predict(X_test_s)
print(classification_report(y_test_s, y_pred))

              precision    recall  f1-score   support

    OFFSHORE       0.75      0.30      0.43        10
     ONSHORE       0.74      0.95      0.83        21

    accuracy                           0.74        31
   macro avg       0.75      0.63      0.63        31
weighted avg       0.74      0.74      0.70        31



In [1232]:
# модель дерева решений
#parameters = {'max_depth': range(1, 100, 1)}
#dtcp = DecisionTreeClassifier()
#grid = GridSearchCV(dtcp, parameters)
#grid.fit(X_train, y_train)
#print(grid.best_estimator_ ) # --> 14+
dtc = DecisionTreeClassifier(max_depth=15)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

    OFFSHORE       0.75      0.60      0.67        10
     ONSHORE       0.83      0.90      0.86        21

    accuracy                           0.81        31
   macro avg       0.79      0.75      0.77        31
weighted avg       0.80      0.81      0.80        31



In [1233]:
# ансамблевый метод ExtraTrees
#parameters = {'n_estimators': range(1, 200, 10)}
#rfcp = RandomForestClassifier()
#grid = GridSearchCV(rfcp, parameters)
#grid.fit(X_train, y_train)
#print(grid.best_estimator_ )
accs = []
# среднее и мода по показателям accuracy
for _ in range(1000):
    X_train_r, X_test_r, y_train_r, y_test_r = train_test_split(X, y, test_size=0.01, random_state = None)
    rfc = ExtraTreesClassifier(n_estimators=20, max_depth=None)
    rfc.fit(X_train_r, y_train_r)
    y_pred_r = rfc.predict(X_test_r)
    accs.append(accuracy_score(y_test_r, y_pred_r))
ser = pd.Series(accs)
print(f'Mean: {ser.mean()}  Median: {ser.median()}  Mode: {ser.mode()}')

Mean: 0.8565  Median: 1.0  Mode: 0    1.0
dtype: float64


In [1234]:
# предикт на основных тестовых данных
y_pred_ans = rfc.predict(df_mod_ts)

In [1235]:
# результат в файл
with open('answer.txt', 'w') as f:
  for res in list(y_pred_ans):
    f.write(res + '\n')