In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the primary-screen cell-line table and take a first look at it.
file_path = 'files/primary-screen-cell-line-info.csv'
data = pd.read_csv(file_path)

print(data.head())

# Per-column count of missing values, to see what needs cleaning.
print(data.isnull().sum())

# Keep only rows that have both tissue annotations. The explicit .copy()
# makes data_cleaned an independent frame, so the column assignment below
# does not raise SettingWithCopyWarning (it would otherwise be a write to a
# frame derived from `data`).
data_cleaned = data.dropna(subset=['primary_tissue', 'secondary_tissue']).copy()

# Store the STR-profiling flag as an integer.
# NOTE(review): astype(int) raises if this column still contains missing
# values after the dropna above - confirm the column is fully populated.
data_cleaned['passed_str_profiling'] = data_cleaned['passed_str_profiling'].astype(int)

print(data_cleaned.head())

sns.set(style="whitegrid")

# Plots 1 and 2: one horizontal count plot per tissue column. Each entry is
# (column in data_cleaned, figure title, y-axis label).
for tissue_column, plot_title, axis_label in (
    ('primary_tissue', 'Distribution of Primary Tissues', 'Primary Tissue'),
    ('secondary_tissue', 'Distribution of Secondary Tissues', 'Secondary Tissue'),
):
    tissue_values = data_cleaned[tissue_column]
    plt.figure(figsize=(12, 6))
    # Order the bars by the categories' value_counts() order (most frequent first).
    sns.countplot(y=tissue_values, order=tissue_values.value_counts().index)
    plt.title(plot_title)
    plt.xlabel('Count')
    plt.ylabel(axis_label)
    plt.show()

# Plot 4: Stacked Bar Plot for Primary and Secondary Tissues
# Cross-tabulate the two tissue columns: one row per primary tissue, one
# column per secondary tissue, cells hold the number of rows in data_cleaned
# (combinations that never occur are filled with 0).
primary_secondary_counts = data_cleaned.groupby(['primary_tissue', 'secondary_tissue']).size().unstack().fillna(0)
# DataFrame.plot(figsize=...) opens its own figure, so no separate
# plt.figure() call is made here: a preceding plt.figure() would only leave
# an empty extra figure in the output.
primary_secondary_counts.plot(kind='bar', stacked=True, figsize=(14, 8), colormap='viridis')
plt.title('Stacked Bar Plot of Primary and Secondary Tissues')
plt.xlabel('Primary Tissue')
plt.ylabel('Count')
plt.legend(title='Secondary Tissue')
plt.show()

# Plot 5: Heatmap of Primary vs. Secondary Tissues
# Rows are primary tissues and columns are secondary tissues, taken from the
# primary_secondary_counts table; every cell is annotated with its count.
plt.figure(figsize=(14, 10))
heat_ax = sns.heatmap(
    primary_secondary_counts,
    cmap='viridis',
    annot=True,
    fmt='g',
)
heat_ax.set_title('Heatmap of Primary vs. Secondary Tissues')
heat_ax.set_xlabel('Secondary Tissue')
heat_ax.set_ylabel('Primary Tissue')
plt.show()

# Plot 6: Pie Chart for Primary Tissues
# Each wedge is one primary tissue, sized by its row count in data_cleaned
# and labelled with its percentage share.
plt.figure(figsize=(10, 10))
tissue_counts = data_cleaned['primary_tissue'].value_counts()
# One viridis colour per distinct primary tissue value.
wedge_colors = sns.color_palette(
    'viridis',
    n_colors=len(data_cleaned['primary_tissue'].unique()),
)
tissue_counts.plot.pie(autopct='%1.1f%%', colors=wedge_colors)
plt.title('Proportion of Primary Tissues')
# Blank out the y-label that the pie plot would otherwise show.
plt.ylabel('')
plt.show()


In [None]:
import pandas as pd

# Read the tab-separated biomarker/drug association table and print it.
file_path = 'files/oncokb_biomarker_drug_associations.tsv'
data = pd.read_csv(file_path, delimiter='\t')
print(data)

In [None]:
import pandas as pd

# Reload the tab-separated association table so this cell runs on its own.
file_path = 'files/oncokb_biomarker_drug_associations.tsv'
data = pd.read_csv(file_path, delimiter='\t')

# Keep only the rows whose 'Cancer Types' value is exactly 'Melanoma'
# (an exact, case-sensitive match on the whole cell).
is_melanoma = data['Cancer Types'] == 'Melanoma'
filtered_data = data.loc[is_melanoma]

print(filtered_data)

In [None]:
import pandas as pd

# Load the cell-line metadata (comma-separated, the read_csv default).
file_path = 'files/Repurposing_Public_24Q2_Cell_Line_Meta_Data.csv'

data = pd.read_csv(file_path, sep=',')

# Keep rows whose ccle_name contains 'SKIN' (case-insensitive); rows with a
# missing ccle_name are treated as non-matching.
filtered_data = data[data['ccle_name'].str.contains('SKIN', case=False, na=False)]

# Show every matching row, but lift the row limit only for this print:
# option_context restores the previous 'display.max_rows' on exit, so the
# setting does not leak into later cells.
with pd.option_context('display.max_rows', None):
    print(filtered_data)

In [None]:
import pandas as pd

# file1: cell-line metadata; file2: collapsed LFC table. Both are plain
# comma-separated files, which is what read_csv parses by default.
file1 = pd.read_csv('files/Repurposing_Public_24Q2_Cell_Line_Meta_Data.csv')
file2 = pd.read_csv('files/Repurposing_Public_24Q2_LFC_COLLAPSED.csv')

# Distinct (row_id, ccle_name) pairs from the metadata.
mapping = file1.loc[:, ['row_id', 'ccle_name']].drop_duplicates()

# Lookup row_id -> ccle_name. If a row_id is paired with more than one
# name, the pair that appears last in `mapping` wins.
row_id_to_ccle_name = dict(zip(mapping['row_id'], mapping['ccle_name']))

# Attach the name to every row of file2; row_ids absent from the lookup
# end up with a missing value.
file2['ccle_name'] = file2['row_id'].map(row_id_to_ccle_name)

# Move ccle_name to the front and keep the remaining columns in their
# existing order.
current_columns = ['ccle_name']
current_columns += [col for col in file2.columns.tolist() if col != 'ccle_name']
file2 = file2[current_columns]

# Write the annotated table without the index column.
file2.to_csv('files/FinalizedData.csv', index=False)