In [1]:
import pandas as pd
import numpy as np
pd.set_option('display.max_columns', None)

In [2]:
# Load the raw table; the preview below shows columns tf_name, gene_name, expression.
raw_path = "tf_gene_expression.parquet"
df = pd.read_parquet(raw_path)

In [3]:
# (rows, columns) of the freshly loaded table.
df.shape

(1209552, 3)

In [4]:
# Preview the first rows to check the column layout.
df.head()

Unnamed: 0,tf_name,gene_name,expression
0,AATF,DPM1,-0.124948
1,AATF,CFH,0.016431
2,AATF,FUCA2,-0.104911
3,AATF,NIPAL3,-0.075916
4,AATF,LAS1L,-0.039633


In [5]:
# Shape of the subset with strictly positive expression.
df[df["expression"].gt(0)].shape

(476762, 3)

In [6]:
# Shape of the subset with strictly negative expression.
df[df["expression"].lt(0)].shape

(732790, 3)

In [7]:
# First labelling attempt: split `expression` into three equal-frequency bins
# (quantile cut) and tag them 0, 1, 2 from lowest to highest.
expression_terciles = pd.qcut(df["expression"], q=3, labels=[0, 1, 2])
df["expression_label"] = expression_terciles

In [8]:
# Rows per quantile bin (equal-frequency bins, so the counts should match).
df["expression_label"].value_counts()

expression_label
0    403184
1    403184
2    403184
Name: count, dtype: int64

In [9]:
# Expression range covered by bin 0: select the bin once, then report (min, max).
bin0_expression = df.loc[df["expression_label"] == 0, "expression"]
bin0_expression.min(), bin0_expression.max()

(np.float64(-1.978837102651596), np.float64(-0.06921310629695654))

In [10]:
# Expression range covered by bin 1: select the bin once, then report (min, max).
bin1_expression = df.loc[df["expression_label"] == 1, "expression"]
bin1_expression.min(), bin1_expression.max()

(np.float64(-0.06921300292015076), np.float64(0.016506717540323734))

In [11]:
# Expression range covered by bin 2: select the bin once, then report (min, max).
bin2_expression = df.loc[df["expression_label"] == 2, "expression"]
bin2_expression.min(), bin2_expression.max()

(np.float64(0.016507630236446857), np.float64(30.954130172729492))

In [65]:
# Hand-picked cut-offs for the threshold-based labelling below:
#   expression >= positive_threshold -> label 2
#   expression <= negative_threshold -> label 0
# One third of each cut-off bounds the middle band that receives label 1.
positive_threshold, negative_threshold = 0.5, -0.45

In [66]:
# Shape of the middle band: strictly between one third of each threshold.
in_middle_band = df["expression"].between(
    negative_threshold / 3, positive_threshold / 3, inclusive="neither"
)
df[in_middle_band].shape

(995508, 4)

In [67]:
# Shape of the upper tail: at or above the positive threshold.
df[df["expression"].ge(positive_threshold)].shape

(5618, 4)

In [68]:
# Shape of the lower tail: at or below the negative threshold.
df[df["expression"].le(negative_threshold)].shape

(5267, 4)

In [69]:
# Start over: replace the quantile labels with a plain column where every row
# carries the sentinel -1 ("no rule matched yet"). Whole-column assignment is
# deliberate -- it swaps out the previous column instead of writing into it.
unlabeled_sentinel = -1
df["expression_label"] = unlabeled_sentinel

In [70]:
# Label 0: expression at or below the negative threshold.
df.loc[df["expression"].le(negative_threshold), "expression_label"] = 0

In [71]:
# Label 1: expression strictly inside (negative_threshold/3, positive_threshold/3).
# Rows between this band and either tail keep the -1 sentinel.
in_middle_band = df["expression"].between(
    negative_threshold / 3, positive_threshold / 3, inclusive="neither"
)
df.loc[in_middle_band, "expression_label"] = 1

In [72]:
# Label 2: expression at or above the positive threshold.
df.loc[df["expression"].ge(positive_threshold), "expression_label"] = 2

In [73]:
# Rows per label after thresholding; -1 counts the rows that matched no rule.
df["expression_label"].value_counts()

expression_label
 1    995508
-1    203159
 2      5618
 0      5267
Name: count, dtype: int64

In [74]:
# Persist only the rows that received a real label (drop the -1 sentinel rows);
# the row index is not written to the file.
labeled_rows = df[df["expression_label"] != -1]
labeled_rows.to_parquet("tf_gene_expression_labeled_v2.parquet", index=False)

In [95]:
# Preview the first rows again, now including the expression_label column.
df.head()

Unnamed: 0,tf_name,gene_name,expression,expression_label
0,AATF,DPM1,-0.124948,1
1,AATF,CFH,0.016431,1
2,AATF,FUCA2,-0.104911,1
3,AATF,NIPAL3,-0.075916,1
4,AATF,LAS1L,-0.039633,1


In [None]:
# Class-balance statistics over labelled rows only.
# -1 is the "no rule matched" sentinel written during thresholding; those rows
# are excluded from the saved dataset, so they must not be counted as a class
# here either. When the column holds no -1 values the filter keeps every row.
labeled_only = df.loc[df["expression_label"] != -1, "expression_label"]
counts = labeled_only.value_counts()    # rows per label, most frequent first
majority_class = counts.idxmax()        # label with the highest row count
majority_count = counts.max()           # number of rows carrying that label

In [88]:
# Display the per-label row counts computed above.
counts

expression_label
1    1191501
2       9538
0       8513
Name: count, dtype: int64

In [89]:
# Display the label that has the highest row count.
majority_class

np.int64(1)

In [92]:
# Distinct labels present in the column, in order of first appearance.
df["expression_label"].drop_duplicates().tolist()

[1, 2, 0]

In [94]:
# Every label in `counts` other than label 1.
not_label_one = counts.index != 1
counts.index[not_label_one]

CategoricalIndex([2, 0], categories=[0, 1, 2], ordered=True, dtype='category', name='expression_label')