In [5]:
import numpy as np
from datasets import load_from_disk
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import pandas as pd
from tqdm import tqdm

In [4]:
# Mount Google Drive at /content/drive; the dataset archives referenced by the
# path constants in this notebook live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [35]:
# Names of the probing tasks; each one maps to a `<task>.hf` dataset folder
# and to one column of the result tables built further down.
tasks = [
  'Length',
  'Depth',
  'CoordinationInversion',
  'BigramShift',
  'OddManOut',
  'Tense',
  'TopConstituents',
  'WordContent',
  'ObjNumber',
  'SubjNumber',
]

# Embedding dimensions treated as outliers; each is probed on its own below.
outlier_dimensions = [
  61, 77, 82, 97, 217, 219, 240, 330, 361, 453,
  494, 496, 498, 551, 570, 588, 656, 731, 749,
]

In [6]:
# Zipped probing datasets stored on the mounted Google Drive.
zip_path1 = '/content/drive/MyDrive/probing.zip'
zip_path2 = '/content/drive/MyDrive/probing_numbers.zip'

# Directory the archives are extracted into (the current working directory).
data_path = "."

# NOTE(review): `disk_path` is not referenced by any cell visible in this
# notebook - confirm whether it is still needed.
disk_path = "drive/MyDrive"

In [7]:
! unzip $zip_path1 -d  $data_path

Archive:  /content/drive/MyDrive/probing.zip
   creating: ./BigramShift.hf/
  inflating: ./BigramShift.hf/dataset_dict.json  
   creating: ./BigramShift.hf/test/
   creating: ./BigramShift.hf/train/
   creating: ./BigramShift.hf/dev/
  inflating: ./BigramShift.hf/test/state.json  
  inflating: ./BigramShift.hf/test/dataset_info.json  
  inflating: ./BigramShift.hf/test/data-00000-of-00001.arrow  
  inflating: ./BigramShift.hf/train/state.json  
  inflating: ./BigramShift.hf/train/dataset_info.json  
  inflating: ./BigramShift.hf/train/data-00000-of-00001.arrow  
  inflating: ./BigramShift.hf/dev/state.json  
  inflating: ./BigramShift.hf/dev/dataset_info.json  
  inflating: ./BigramShift.hf/dev/data-00000-of-00001.arrow  
   creating: ./CoordinationInversion.hf/
  inflating: ./CoordinationInversion.hf/.DS_Store  
  inflating: ./__MACOSX/CoordinationInversion.hf/._.DS_Store  
  inflating: ./CoordinationInversion.hf/dataset_dict.json  
  inflating: ./__MACOSX/CoordinationInversion.hf/._d

In [8]:
! unzip $zip_path2 -d  $data_path

Archive:  /content/drive/MyDrive/probing_numbers.zip
   creating: ./ObjNumber.hf/
  inflating: ./ObjNumber.hf/dataset_dict.json  
   creating: ./ObjNumber.hf/test/
   creating: ./ObjNumber.hf/train/
   creating: ./ObjNumber.hf/dev/
  inflating: ./ObjNumber.hf/test/state.json  
  inflating: ./ObjNumber.hf/test/dataset_info.json  
  inflating: ./ObjNumber.hf/test/data-00000-of-00001.arrow  
  inflating: ./ObjNumber.hf/train/state.json  
  inflating: ./ObjNumber.hf/train/dataset_info.json  
  inflating: ./ObjNumber.hf/train/data-00000-of-00001.arrow  
  inflating: ./ObjNumber.hf/dev/state.json  
  inflating: ./ObjNumber.hf/dev/dataset_info.json  
  inflating: ./ObjNumber.hf/dev/data-00000-of-00001.arrow  
   creating: ./SubjNumber.hf/
  inflating: ./SubjNumber.hf/.DS_Store  
  inflating: ./__MACOSX/SubjNumber.hf/._.DS_Store  
  inflating: ./SubjNumber.hf/dataset_dict.json  
   creating: ./SubjNumber.hf/test/
   creating: ./SubjNumber.hf/train/
   creating: ./SubjNumber.hf/dev/
  inflating

In [8]:
def one_feat_classif(task, n_features=768, data_dir='/content'):
  """Probe a task with one logistic-regression classifier per embedding dimension.

  For every probed feature column, a fresh LogisticRegression is fit on that
  single column of the train split and its accuracy is measured on the test
  split.

  Args:
    task: name of the probing task; the dataset is loaded from
      `<data_dir>/<task>.hf`.
    n_features: number of leading feature columns to probe. Pass `None` to
      probe every column present in the train embeddings.
    data_dir: directory that contains the `<task>.hf` dataset folders.
      Defaults to '/content', the location used throughout this notebook.

  Returns:
    1-D numpy array of test accuracies; element `i` belongs to feature `i`.
  """
  import os  # local import so the function does not depend on a top-level `import os`

  embed = load_from_disk(os.path.join(data_dir, task+'.hf'))
  # 'X' is indexed as [:, ind] below, i.e. it is used as a 2-D
  # (samples, features) array; 'y' holds the labels.
  X_train_full = np.array(embed['train']['X'])
  y_train = np.array(embed['train']['y'])
  X_test_full = np.array(embed['test']['X'])
  y_test = np.array(embed['test']['y'])

  if n_features is None:
    n_features = X_train_full.shape[1]

  accuracies = []
  for ind in tqdm(range(n_features)):
    # The classifier expects 2-D input, so the single column is reshaped to (n, 1).
    model = LogisticRegression()
    model.fit(X_train_full[:, ind].reshape(-1, 1), y_train)
    y_pred = model.predict(X_test_full[:, ind].reshape(-1, 1))
    accuracies.append(accuracy_score(y_test, y_pred))
  return np.array(accuracies)


In [9]:
# Per-dimension test accuracies for the ObjNumber task (one classifier per feature).
acc_objnumber=one_feat_classif('ObjNumber')

100%|██████████| 768/768 [00:59<00:00, 12.97it/s]


In [10]:
# Persist the accuracies; they are read back later as 'accuracy_ObjNumber.npy'.
np.save('accuracy_ObjNumber', acc_objnumber)

In [12]:
# Per-dimension test accuracies for the SubjNumber task (one classifier per feature).
acc_subjnumber=one_feat_classif('SubjNumber')

100%|██████████| 768/768 [01:20<00:00,  9.56it/s]


In [13]:
# Persist the accuracies; they are read back later as 'accuracy_SubjNumber.npy'.
np.save('accuracy_SubjNumber', acc_subjnumber)

In [14]:
# Per-dimension test accuracies for the Length task (one classifier per feature).
acc_length=one_feat_classif('Length')

100%|██████████| 768/768 [06:06<00:00,  2.10it/s]


In [15]:
# Persist the accuracies; they are read back later as 'accuracy_Length.npy'.
np.save('accuracy_Length', acc_length)

In [16]:
# tasks[1:-3] selects Depth through TopConstituents. It skips Length (index 0)
# and ObjNumber/SubjNumber (last two), which are computed in their own cells,
# and it also skips WordContent, for which no accuracy file is produced here.
# NOTE: the slice depends on the order of `tasks`; reordering that list
# changes which tasks are processed.
for task in tasks[1:-3]:
  acc_task=one_feat_classif(task)
  # Saved as 'accuracy_<task>.npy' (np.save appends the extension).
  np.save('accuracy_'+task, acc_task)

100%|██████████| 768/768 [16:51<00:00,  1.32s/it]
100%|██████████| 768/768 [00:55<00:00, 13.72it/s]
100%|██████████| 768/768 [01:01<00:00, 12.58it/s]
100%|██████████| 768/768 [01:18<00:00,  9.74it/s]
100%|██████████| 768/768 [01:06<00:00, 11.50it/s]
100%|██████████| 768/768 [18:14<00:00,  1.43s/it]


In [7]:
# Random-feature baseline: draw 20 dimension indices from [0, num_features)
# with a fixed seed so the selection is reproducible.
# `Generator.integers` samples with replacement, so an index may occur twice.
num_features = 768
rng = np.random.default_rng(seed=23)
rand_index = rng.integers(low=0, high=num_features, size=20)

In [36]:
# Accumulators filled by the probing loop in the next cell:
#   df          maps task name -> list of accuracies (baseline first, then one per outlier dimension)
#   rand_class  holds one chance-level value (1 / number of classes) per task, in `tasks` order
df={}
rand_class=[]

In [37]:
# Start from empty accumulators so that re-running this cell rebuilds the
# results instead of appending a second set of entries to `rand_class`.
df={}
rand_class=[]

for task in tasks:
  d=[]
  embed = load_from_disk('/content/'+task+'.hf')
  X_train_full=np.array(embed['train']['X'])
  y_train=np.array(embed['train']['y'])
  X_test_full=np.array(embed['test']['X'])
  y_test=np.array(embed['test']['y'])

  # Chance level taken as 1 / (number of distinct training labels).
  # NOTE(review): this equals the expected accuracy of random guessing only
  # if the classes are balanced - confirm for these datasets.
  rand_class.append(1/len(np.unique(y_train)))

  # Baseline entry: mean test accuracy of single-feature classifiers over the
  # randomly drawn dimensions in `rand_index`.
  s_acc=0
  for ind in rand_index:
    model=LogisticRegression()
    model.fit(X_train_full[:,ind].reshape(-1, 1), y_train)
    y_pred=model.predict(X_test_full[:,ind].reshape(-1,1))
    # Arguments in (y_true, y_pred) order, matching one_feat_classif.
    acc=accuracy_score(y_test, y_pred)
    s_acc+=acc
  # Divide by the actual number of sampled dimensions rather than a literal 20,
  # so the mean stays correct if the sample size is changed.
  mean_acc=s_acc/len(rand_index)
  d.append(mean_acc)

  # One entry per outlier dimension: test accuracy of a classifier that sees
  # only that dimension.
  for od in outlier_dimensions:
    model=LogisticRegression()
    X_tr=X_train_full[:,od]
    model.fit(X_tr.reshape(-1, 1), y_train)
    y_pred=model.predict(X_test_full[:,od].reshape(-1,1))
    acc=accuracy_score(y_test, y_pred)
    d.append(acc)

  df[task]=d

In [38]:
# Row labels: the averaged random-feature baseline first, followed by one row
# per outlier dimension - the same order in which each task's list was filled.
a = ['random features', *outlier_dimensions]

# Columns are the tasks (keys of `df`); rows are labelled by `a`.
table = pd.DataFrame(df, index=a)

In [39]:
# Add the per-task chance level as a row labelled 'random classification';
# `rand_class` is in the same task order as the table's columns.
table.loc['random classification']=rand_class

In [40]:
# Show the assembled table (rows: baseline, outlier dimensions, chance level; columns: tasks).
table

Unnamed: 0,Length,Depth,CoordinationInversion,BigramShift,OddManOut,Tense,TopConstituents,WordContent,ObjNumber,SubjNumber
random features,0.190411,0.18064,0.513462,0.55564,0.51775,0.53609,0.0693,0.00191,0.527415,0.527845
61,0.214186,0.1817,0.512597,0.5196,0.5126,0.5816,0.0712,0.0023,0.5047,0.4873
77,0.438375,0.177,0.518896,0.5732,0.5449,0.5636,0.0904,0.0017,0.5074,0.5115
82,0.264306,0.1823,0.533993,0.6804,0.5727,0.5025,0.083,0.0015,0.5356,0.5094
97,0.203882,0.1763,0.522496,0.6254,0.528,0.6694,0.0767,0.0024,0.566,0.5801
217,0.281513,0.19,0.521596,0.5092,0.5183,0.5711,0.079,0.0016,0.5138,0.5122
219,0.298719,0.2026,0.523295,0.526,0.5023,0.5065,0.0773,0.002,0.5292,0.5084
240,0.236595,0.1752,0.492302,0.521,0.5068,0.4836,0.0697,0.0018,0.5007,0.4881
330,0.216587,0.1827,0.5016,0.5043,0.5199,0.493,0.0794,0.0022,0.5158,0.5257
361,0.204582,0.182,0.506599,0.5492,0.5089,0.5477,0.065,0.0016,0.5197,0.5292


Error: Runtime no longer has a reference to this dataframe, please re-run this cell and try again.


In [62]:
# Write the table to CSV. The row labels are written as the first column.
# Requires `table` to be defined in the running kernel: after a runtime
# restart the cells that build it must be re-run first.
table.to_csv('1featclassif.csv')

NameError: name 'table' is not defined

In [76]:
# Reload the saved results so the formatting cells below can run without
# recomputing the classifiers (the imports are repeated for a fresh runtime).
import pandas as pd
import numpy as np
# No index column is specified, so the saved row labels come back as an
# ordinary column, which the cells below refer to as 'Unnamed: 0'.
table1= pd.read_csv('1featclassif.csv')

In [77]:
# Inspect the table as loaded from the CSV file.
table1

Unnamed: 0.1,Unnamed: 0,Length,Depth,CoordinationInversion,BigramShift,OddManOut,Tense,TopConstituents,WordContent,ObjNumber,SubjNumber
0,random features,0.190411,0.18064,0.513462,0.55564,0.51775,0.53609,0.0693,0.00191,0.527415,0.527845
1,61,0.214186,0.1817,0.512597,0.5196,0.5126,0.5816,0.0712,0.0023,0.5047,0.4873
2,77,0.438375,0.177,0.518896,0.5732,0.5449,0.5636,0.0904,0.0017,0.5074,0.5115
3,82,0.264306,0.1823,0.533993,0.6804,0.5727,0.5025,0.083,0.0015,0.5356,0.5094
4,97,0.203882,0.1763,0.522496,0.6254,0.528,0.6694,0.0767,0.0024,0.566,0.5801
5,217,0.281513,0.19,0.521596,0.5092,0.5183,0.5711,0.079,0.0016,0.5138,0.5122
6,219,0.298719,0.2026,0.523295,0.526,0.5023,0.5065,0.0773,0.002,0.5292,0.5084
7,240,0.236595,0.1752,0.492302,0.521,0.5068,0.4836,0.0697,0.0018,0.5007,0.4881
8,330,0.216587,0.1827,0.5016,0.5043,0.5199,0.493,0.0794,0.0022,0.5158,0.5257
9,361,0.204582,0.182,0.506599,0.5492,0.5089,0.5477,0.065,0.0016,0.5197,0.5292


In [64]:
# Rounding precision per column: the two tasks with very small accuracies keep
# more decimals, every other numeric column is rounded to 2. The label column
# 'Unnamed: 0' is left out and therefore stays untouched.
decimals = {'TopConstituents': 3, 'WordContent': 4}
for column in table1.columns.drop(['TopConstituents', 'WordContent', 'Unnamed: 0']):
  decimals[column] = 2

# Round, then flip the table so that tasks become rows.
table1 = table1.round(decimals).T

# Promote the former label column (now a row) to column headers and remove it
# from the data.
table1.columns = table1.loc['Unnamed: 0']
table1 = table1.drop(index='Unnamed: 0')


In [65]:
# First 11 columns of the transposed table.
table1.iloc[:,:11]

Unnamed: 0,random features,61,77,82,97,217,219,240,330,361,453
Length,0.19,0.21,0.44,0.26,0.2,0.28,0.3,0.24,0.22,0.2,0.23
Depth,0.18,0.18,0.18,0.18,0.18,0.19,0.2,0.18,0.18,0.18,0.18
CoordinationInversion,0.51,0.51,0.52,0.53,0.52,0.52,0.52,0.49,0.5,0.51,0.51
BigramShift,0.56,0.52,0.57,0.68,0.63,0.51,0.53,0.52,0.5,0.55,0.54
OddManOut,0.52,0.51,0.54,0.57,0.53,0.52,0.5,0.51,0.52,0.51,0.53
Tense,0.54,0.58,0.56,0.5,0.67,0.57,0.51,0.48,0.49,0.55,0.5
TopConstituents,0.069,0.071,0.09,0.083,0.077,0.079,0.077,0.07,0.079,0.065,0.072
WordContent,0.0019,0.0023,0.0017,0.0015,0.0024,0.0016,0.002,0.0018,0.0022,0.0016,0.0026
ObjNumber,0.53,0.5,0.51,0.54,0.57,0.51,0.53,0.5,0.52,0.52,0.55
SubjNumber,0.53,0.49,0.51,0.51,0.58,0.51,0.51,0.49,0.53,0.53,0.57


In [66]:
# Remaining columns of the transposed table (position 11 onward).
table1.iloc[:,11:]

Unnamed: 0,494,496,498,551,570,588,656,731,749,random classification
Length,0.27,0.22,0.23,0.29,0.19,0.21,0.21,0.2,0.23,0.17
Depth,0.18,0.2,0.19,0.18,0.18,0.18,0.18,0.19,0.18,0.14
CoordinationInversion,0.54,0.49,0.51,0.51,0.5,0.51,0.5,0.52,0.5,0.5
BigramShift,0.57,0.53,0.54,0.51,0.55,0.54,0.52,0.57,0.61,0.5
OddManOut,0.53,0.51,0.51,0.52,0.52,0.53,0.5,0.51,0.53,0.5
Tense,0.51,0.54,0.61,0.57,0.57,0.6,0.49,0.54,0.64,0.5
TopConstituents,0.072,0.08,0.094,0.072,0.066,0.095,0.068,0.079,0.068,0.05
WordContent,0.0015,0.0024,0.0025,0.0019,0.0019,0.0026,0.0019,0.0018,0.0019,0.001
ObjNumber,0.52,0.51,0.54,0.52,0.6,0.53,0.49,0.54,0.57,0.5
SubjNumber,0.52,0.51,0.47,0.54,0.57,0.5,0.49,0.55,0.57,0.5


In [69]:
# Last column only: the chance-level ('random classification') value per task.
# Selecting one column yields a Series, for which a transpose is a no-op.
table1.iloc[:, -1]

Length                    0.17
Depth                     0.14
CoordinationInversion      0.5
BigramShift                0.5
OddManOut                  0.5
Tense                      0.5
TopConstituents           0.05
WordContent              0.001
ObjNumber                  0.5
SubjNumber                 0.5
Name: random classification, dtype: object

In [67]:
# Full transposed table (tasks as rows).
table1

Unnamed: 0,random features,61,77,82,97,217,219,240,330,361,...,494,496,498,551,570,588,656,731,749,random classification
Length,0.19,0.21,0.44,0.26,0.2,0.28,0.3,0.24,0.22,0.2,...,0.27,0.22,0.23,0.29,0.19,0.21,0.21,0.2,0.23,0.17
Depth,0.18,0.18,0.18,0.18,0.18,0.19,0.2,0.18,0.18,0.18,...,0.18,0.2,0.19,0.18,0.18,0.18,0.18,0.19,0.18,0.14
CoordinationInversion,0.51,0.51,0.52,0.53,0.52,0.52,0.52,0.49,0.5,0.51,...,0.54,0.49,0.51,0.51,0.5,0.51,0.5,0.52,0.5,0.5
BigramShift,0.56,0.52,0.57,0.68,0.63,0.51,0.53,0.52,0.5,0.55,...,0.57,0.53,0.54,0.51,0.55,0.54,0.52,0.57,0.61,0.5
OddManOut,0.52,0.51,0.54,0.57,0.53,0.52,0.5,0.51,0.52,0.51,...,0.53,0.51,0.51,0.52,0.52,0.53,0.5,0.51,0.53,0.5
Tense,0.54,0.58,0.56,0.5,0.67,0.57,0.51,0.48,0.49,0.55,...,0.51,0.54,0.61,0.57,0.57,0.6,0.49,0.54,0.64,0.5
TopConstituents,0.069,0.071,0.09,0.083,0.077,0.079,0.077,0.07,0.079,0.065,...,0.072,0.08,0.094,0.072,0.066,0.095,0.068,0.079,0.068,0.05
WordContent,0.0019,0.0023,0.0017,0.0015,0.0024,0.0016,0.002,0.0018,0.0022,0.0016,...,0.0015,0.0024,0.0025,0.0019,0.0019,0.0026,0.0019,0.0018,0.0019,0.001
ObjNumber,0.53,0.5,0.51,0.54,0.57,0.51,0.53,0.5,0.52,0.52,...,0.52,0.51,0.54,0.52,0.6,0.53,0.49,0.54,0.57,0.5
SubjNumber,0.53,0.49,0.51,0.51,0.58,0.51,0.51,0.49,0.53,0.53,...,0.52,0.51,0.47,0.54,0.57,0.5,0.49,0.55,0.57,0.5




In [59]:
# For every task with a saved accuracy file, collect the 10 best single
# dimensions: their accuracies and their indices, best first.
df=pd.DataFrame()
for task in tasks:
  # No accuracy file is produced for WordContent in this notebook.
  if task=='WordContent':
    continue
  data=np.load('accuracy_'+task+'.npy')
  # Ascending argsort, reversed and truncated: positions of the 10 largest values.
  order=np.argsort(data)[::-1][:10]
  df['Accuracy'+task]=data[order]
  df['Index'+task]=order


In [60]:
# Show the top-10 accuracies and feature indices per task.
df

Unnamed: 0,AccuracyLength,IndexLength,AccuracyDepth,IndexDepth,AccuracyCoordinationInversion,IndexCoordinationInversion,AccuracyBigramShift,IndexBigramShift,AccuracyOddManOut,IndexOddManOut,AccuracyTense,IndexTense,AccuracyTopConstituents,IndexTopConstituents,AccuracyObjNumber,IndexObjNumber,AccuracySubjNumber,IndexSubjNumber
0,0.438375,77,0.2045,734,0.54849,248,0.6804,82,0.5727,82,0.6929,709,0.1006,16,0.5999,570,0.6258,154
1,0.298719,219,0.2034,180,0.542392,666,0.6377,477,0.5645,331,0.6827,587,0.0946,588,0.5977,154,0.6257,530
2,0.293117,551,0.2033,74,0.541292,69,0.6254,97,0.5522,248,0.6694,97,0.0944,498,0.5959,379,0.6046,612
3,0.282813,38,0.2026,219,0.539092,494,0.6223,666,0.5449,77,0.6476,286,0.0942,624,0.5931,93,0.6024,327
4,0.281713,144,0.2023,397,0.537493,176,0.6215,662,0.5416,69,0.6393,44,0.0904,77,0.5927,739,0.5966,689
5,0.281513,217,0.201,539,0.536193,516,0.6212,155,0.5414,611,0.6386,749,0.0887,179,0.59,760,0.595,379
6,0.268808,664,0.1998,133,0.535993,538,0.6182,157,0.5405,342,0.638,690,0.0886,81,0.5895,443,0.5912,491
7,0.266006,494,0.198,185,0.535993,729,0.6141,342,0.5393,420,0.6374,471,0.0874,138,0.5856,327,0.5908,485
8,0.264306,82,0.197,245,0.535193,332,0.6136,248,0.5378,157,0.6341,166,0.0869,384,0.5824,530,0.5895,79
9,0.262605,331,0.1964,514,0.535093,360,0.6126,604,0.5363,662,0.6328,438,0.0865,331,0.5802,698,0.5882,634


In [61]:
# Save the top-10 table to CSV.
df.to_csv('1featclassif_all.csv')