In [1]:
import numpy as np
import torch

In [2]:
# Import additional necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
#from transformers import AutoModel, AutoTokenizer
from matminer.datasets import load_dataset

# Set random seeds for reproducibility.
# One named constant keeps NumPy and PyTorch on the same seed and gives a
# single place to change it.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x1b1ccb59ff0>

In [None]:
# Load superconductor dataset from matminer
# This dataset contains superconducting critical temperatures and material compositions
# NOTE(review): matminer's dataset catalogue registers this data as
# "superconductivity2018"; the shorter name "superconductivity" is not a
# registered key, so the load would always fall into the except branch.
# Confirm against the installed matminer version.
try:
    df = load_dataset("superconductivity2018")
except Exception as e:
    # Best-effort load: leave `df` explicitly set to None instead of unbound
    # (or stale from a previous run) so the failure is visible in the namespace.
    df = None
    print(f"Error loading matminer dataset: {e}")
    print("Attempting to load alternative dataset...")
else:
    print(f"Successfully loaded superconductor dataset with {len(df)} entries")

# Jupyter only renders the value of a cell's final top-level expression; a bare
# `df.head()` inside the try block is evaluated and discarded. A None result
# renders nothing, so a failed load shows only the messages above.
df.head() if df is not None else None

In [None]:
# Alternative: Load dataset from SuperCon database
# This is a fallback: it only downloads when no usable `df` is present, so a
# dataset that already loaded successfully is not overwritten.
# NOTE(review): confirm the export URL is still served and that the CSV has a
# 'Tc' column, which the exploration cell indexes.
url = 'https://supercon.nims.go.jp/supercon/export_csv/export_sc_tc.csv'
if globals().get("df") is None:
    try:
        df = pd.read_csv(url)
    except Exception as e:
        # Keep `df` explicitly None on failure so the state is unambiguous.
        df = None
        print(f"Error loading alternative dataset: {e}")
        print("Installing and using matbench_superconductivity dataset...")
    else:
        print(f"Successfully loaded SuperCon dataset with {len(df)} entries")

# Preview as the cell's final top-level expression so Jupyter renders it;
# a bare `df.head()` inside the try block is evaluated and discarded.
df.head() if df is not None else None

In [None]:
# If both fail, try using matbench dataset
!pip install matbench
from matbench.bench import MatbenchBenchmark
mb = MatbenchBenchmark()
df = mb.get_dataset("matbench_supercond")
print(f"Successfully loaded matbench_supercond dataset with {len(df)} entries")
df.head()

In [None]:
# Explore the dataset
print("Dataset information:")
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() appends a stray "None" line.
df.info()
print("\nDataset statistics:")
print(df.describe())

# Visualize T_c distribution
# Uses the explicit figure/axes interface so the plot does not depend on
# pyplot's implicit "current figure" state.
fig, ax = plt.subplots(figsize=(10, 6))
# Drop missing values so only observed critical temperatures are binned.
ax.hist(df['Tc'].dropna(), bins=50)
ax.set_xlabel('Critical Temperature (K)')
ax.set_ylabel('Frequency')
ax.set_title('Distribution of Superconductor Critical Temperatures')
plt.show()