In [23]:
import warnings
warnings.filterwarnings("ignore")
import logging
logging.basicConfig(level=logging.ERROR)

In [27]:
import requests
import pandas as pd
from tqdm import tqdm
from io import StringIO

def parse_format_pre_1977(line):
    return {
        'Year': int(line[0:4]),
        'Month': int(line[4:6]),
        'Day': int(line[6:8]),
        'Time': float(line[8:12]),
        'Greenwich sunspot group': int(line[12:20]),
        'Observed umbral area': float(line[25:30]),
        'Observed whole spot area': float(line[30:35]),
        'Corrected umbral area': float(line[35:40]),
        'Corrected whole spot area': float(line[40:45]),
        'Distance from center of solar disk': float(line[45:51]),
        'Position angle from heliographic north': float(line[51:57]),
        'Carrington Longitude': float(line[57:63]),
        'Latitude': float(line[63:69]),
        'Central meridian distance': float(line[69:75])
    }

def parse_format_1977_to_1982(line):
    return {
        'Year': int(line[0:4]),
        'Month': int(line[4:6]),
        'Day': int(line[6:8]),
        'Time': float(line[8:12]),
        'NOAA/USAF group': int(line[12:20]),
        'Greenwich Group type': line[22:24].strip(),  # Changed to string
        'Zurich/McIntosh group type': line[25:30].strip(),
        'Observed whole spot area': float(line[30:35]),
        'Number of spots in group': int(line[35:40]),
        'Corrected whole spot area': float(line[40:45]),
        'Distance from center of solar disk': float(line[45:51]),
        'Position angle from heliographic north': float(line[51:57]),
        'Carrington Longitude': float(line[57:63]),
        'Latitude': float(line[63:69]),
        'Central meridian distance': float(line[69:75])
    }

def parse_format_post_1982(line):
    return {
        'Year': int(line[0:4]),
        'Month': int(line[4:6]),
        'Day': int(line[6:8]),
        'Time': float(line[8:12]),
        'NOAA/USAF group': int(line[12:20]),
        'Suffix to group number': line[20:21],
        'Magnetic group type': line[21:24],
        'Zurich/McIntosh group type': line[25:30],
        'Observed whole spot area': float(line[30:35]),
        'Number of spots in group': int(line[35:40]),
        'Corrected whole spot area': float(line[40:45]),
        'Distance from center of solar disk': float(line[45:51]),
        'Position angle from heliographic north': float(line[51:57]),
        'Carrington Longitude': float(line[57:63]),
        'Latitude': float(line[63:69]),
        'Central meridian distance': float(line[69:75])
    }

data_dir='D:\\_NILESH\\dissertation\\data\\'
format_url='http://solarcyclescience.com/AR_Database/format.txt'
data_url_template = 'http://solarcyclescience.com/AR_Database/g{year}.txt'

# Create an empty DataFrame to store all data
all_data = pd.DataFrame()

# Iterate over the years
for year in tqdm(range(1975, 2024)):
    # Get the data
    data_url = data_url_template.format(year=year)
    data = requests.get(data_url).text
    
    # Split the data into lines
    lines = data.split('\n')

    # Iterate over the lines in the data
    for line in lines:
        # Skip empty lines or lines with only spaces
        if line.strip() == '':
            continue

        # Depending on the year, parse the line differently
        try:
            if year < 1977:
                row = parse_format_pre_1977(line)
            elif year < 1982:
                row = parse_format_1977_to_1982(line)
            else:
                row = parse_format_post_1982(line)
            
            # Append the parsed row to the all_data DataFrame
            all_data = all_data.append(row, ignore_index=True)
        except ValueError:
            print(f"Skipping line due to parsing error: {line}")

# Save all data to a CSV file
all_data.to_csv(data_dir + 'sunspots.csv', index=False)

  8%|▊         | 4/49 [00:15<04:15,  5.67s/it]

Skipping line due to parsing error: 197912 3.000         0 0    0    0    0    0 0.000   0.0   0.0   0.0   0.0


100%|██████████| 49/49 [10:15<00:00, 12.56s/it]


In [30]:
sun=pd.read_csv(data_dir+'sunspots.csv')
sun

Unnamed: 0,Year,Month,Day,Time,Greenwich sunspot group,Observed umbral area,Observed whole spot area,Corrected umbral area,Corrected whole spot area,Distance from center of solar disk,Position angle from heliographic north,Carrington Longitude,Latitude,Central meridian distance,Greenwich Group type,NOAA/USAF group,Number of spots in group,Zurich/McIntosh group type,Magnetic group type,Suffix to group number
0,1975.0,1.0,1.0,0.427,23606.0,39.0,302.0,25.0,191.0,0.610,100.2,243.2,-8.8,-37.4,,,,,,
1,1975.0,1.0,1.0,0.427,23607.0,10.0,105.0,10.0,103.0,0.858,281.0,338.8,7.7,58.2,,,,,,
2,1975.0,1.0,2.0,0.533,23606.0,20.0,167.0,11.0,90.0,0.361,105.8,245.4,-8.8,-20.6,,,,,,
3,1975.0,1.0,2.0,0.533,23607.0,5.0,32.0,9.0,50.0,0.942,279.5,335.7,7.8,69.7,,,,,,
4,1975.0,1.0,2.0,0.533,23608.0,2.0,8.0,3.0,11.0,0.927,85.0,198.3,3.4,-67.7,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90813,2023.0,7.0,4.0,0.000,,,19.0,,10.0,0.207,210.1,103.0,-7.0,6.0,,13356.0,2.0,Bxo,B,
90814,2023.0,7.0,4.0,0.000,,,98.0,,50.0,0.199,154.2,92.0,-7.0,-5.0,,13357.0,6.0,Dao,B,
90815,2023.0,7.0,4.0,0.000,,,167.0,,100.0,0.546,120.1,68.0,-13.0,-29.0,,13358.0,14.0,Dai,B,
90816,2023.0,7.0,4.0,0.000,,,129.0,,80.0,0.586,136.1,71.0,-22.0,-26.0,,13359.0,8.0,Dso,B,


In [29]:
sun.dtypes

Year                                      float64
Month                                     float64
Day                                       float64
Time                                      float64
Greenwich sunspot group                   float64
Observed umbral area                      float64
Observed whole spot area                  float64
Corrected umbral area                     float64
Corrected whole spot area                 float64
Distance from center of solar disk        float64
Position angle from heliographic north    float64
Carrington Longitude                      float64
Latitude                                  float64
Central meridian distance                 float64
Greenwich Group type                       object
NOAA/USAF group                           float64
Number of spots in group                  float64
Zurich/McIntosh group type                 object
Magnetic group type                        object
Suffix to group number                     object
