# 总站光伏分析+异常点处理

In [13]:
import os
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

def plot_combined_data(combined_df, title="Combined Power vs GHI_2"):
    """Show one scatter plot of Power against GHI_2 with a red OLS trendline.

    Args:
        combined_df: DataFrame containing at least ``GHI_2`` and ``Power``.
        title: Figure title; it is centred horizontally above the plot.
    """

    def _axis_style(axis_title):
        # Both axes share the same grid / zero-line settings.
        return dict(title=axis_title, showgrid=True, zeroline=False)

    scatter_options = dict(
        x='GHI_2',
        y='Power',
        color=None,  # all points are drawn in a single colour
        labels={"GHI_2": "GHI_2", "Power": "Power"},
        title=title,
        opacity=0.6,
        trendline="ols",
        trendline_color_override='red',
    )
    fig = px.scatter(combined_df, **scatter_options)

    # The selector limits the size change to traces whose mode is 'markers'.
    fig.update_traces(marker=dict(size=5), selector=dict(mode='markers'))

    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        xaxis=_axis_style("GHI_2"),
        yaxis=_axis_style("Power"),
    )
    fig.show()

def plot_by_quarters(df):
    """Show a 2x2 grid of Power-vs-GHI_2 scatter plots, one per quarter.

    Args:
        df: DataFrame with ``GHI_2``, ``Power`` and an integer ``Quarter``
            column (1-4). Quarters with no rows yield an empty subplot.
    """
    quarter_titles = ("Q1", "Q2", "Q3", "Q4")
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=quarter_titles,
        x_title="GHI_2",
        y_title="Power",
    )

    for quarter, name in enumerate(quarter_titles, start=1):
        # Row-major placement: Q1, Q2 on the first row; Q3, Q4 on the second.
        row_offset, col_offset = divmod(quarter - 1, 2)
        subset = df.loc[df['Quarter'] == quarter]
        points = go.Scatter(
            x=subset['GHI_2'],
            y=subset['Power'],
            mode='markers',
            marker=dict(size=5, opacity=0.6),
            name=name,
        )
        fig.add_trace(points, row=row_offset + 1, col=col_offset + 1)

    fig.update_layout(
        height=800,
        width=800,
        title_text="Quarterly Scatter Plots of Power vs GHI_2",
        showlegend=False,
    )
    fig.show()

def _load_aligned_day(pv_file_path, gfs_file_path, window_start, window_end):
    """Load one day's PV and GFS CSV files and pair their rows by timestamp.

    Args:
        pv_file_path: CSV with a ``Date`` timestamp column and a ``Power`` column.
        gfs_file_path: CSV with a ``data_time`` timestamp column and a
            ``GHI_2`` column.
        window_start: ``datetime.time``; rows earlier in the day are dropped.
        window_end: ``datetime.time``; rows later in the day are dropped.

    Returns:
        DataFrame with columns ``Power``, ``GHI_2``, ``Date`` holding only the
        timestamps that occur in both files inside the (inclusive) window.
    """
    pv_df = pd.read_csv(pv_file_path)
    gfs_df = pd.read_csv(gfs_file_path)

    pv_df["Date"] = pd.to_datetime(pv_df["Date"])
    gfs_df["data_time"] = pd.to_datetime(gfs_df["data_time"])

    pv_clock = pv_df["Date"].dt.time
    gfs_clock = gfs_df["data_time"].dt.time
    pv_day = pv_df.loc[
        (pv_clock >= window_start) & (pv_clock <= window_end),
        ["Date", "Power"],
    ]
    gfs_day = gfs_df.loc[
        (gfs_clock >= window_start) & (gfs_clock <= window_end),
        ["data_time", "GHI_2"],
    ].rename(columns={"data_time": "Date"})

    # Join on the timestamp instead of truncating both frames to a common
    # length: positional pairing silently shifts every later sample when one
    # file is missing a row.
    # NOTE(review): assumes both files use the same clock/timezone for their
    # timestamps - confirm against the data.
    paired = pd.merge(pv_day, gfs_day, on="Date", how="inner")
    return paired[["Power", "GHI_2", "Date"]]


def process_and_plot_by_quarters(pv_dir, gfs_dir, start_date, end_date):
    """Combine daily PV and GFS files over a date range and plot Power vs GHI_2.

    For every calendar day from ``start_date`` to ``end_date`` the function
    looks for ``PV_<YYYY-MM-DD>.csv`` in ``pv_dir`` and
    ``lat31.25_lon121.5_<YYYYMMDD>_gfs_15min.csv`` in ``gfs_dir``. Days where
    either file is missing are skipped. Rows between 05:00 and 18:00
    (inclusive) are kept and paired by timestamp, rows containing NaN are
    dropped, and the result is shown as one overall scatter plot followed by
    one scatter plot per quarter.

    Args:
        pv_dir: Directory holding the daily PV CSV files.
        gfs_dir: Directory holding the daily GFS CSV files.
        start_date: First day to process (anything ``pd.date_range`` accepts).
        end_date: Last day to process.
    """
    # Loop-invariant window bounds, computed once instead of four times per day.
    window_start = pd.to_datetime("05:00").time()
    window_end = pd.to_datetime("18:00").time()

    daily_frames = []
    for date in pd.date_range(start_date, end_date):
        pv_file_path = os.path.join(pv_dir, f"PV_{date.strftime('%Y-%m-%d')}.csv")
        gfs_file_path = os.path.join(
            gfs_dir, f"lat31.25_lon121.5_{date.strftime('%Y%m%d')}_gfs_15min.csv"
        )
        if not (os.path.exists(pv_file_path) and os.path.exists(gfs_file_path)):
            continue

        day_df = _load_aligned_day(pv_file_path, gfs_file_path, window_start, window_end)
        if not day_df.empty:
            daily_frames.append(day_df)

    if daily_frames:
        combined_df = pd.concat(daily_frames, ignore_index=True)
    else:
        # No usable day: keep the expected columns so the plots render empty.
        combined_df = pd.DataFrame(columns=["Power", "GHI_2", "Date"])

    combined_df = combined_df.dropna()
    combined_df['Date'] = pd.to_datetime(combined_df['Date'])
    combined_df['Quarter'] = combined_df['Date'].dt.quarter

    plot_combined_data(combined_df)

    plot_by_quarters(combined_df)

# Driver for this cell: local input folders and the date span to analyse.
# pv_directory is searched for PV_<YYYY-MM-DD>.csv files and gfs_directory for
# lat31.25_lon121.5_<YYYYMMDD>_gfs_15min.csv files.
pv_directory = '/Users/maxz/Desktop/EQUOTA/光伏/processed/PV/全口径光伏'
gfs_directory = '/Users/maxz/Desktop/EQUOTA/XUHUI'
# Both dates are passed to pd.date_range, which includes both end points.
start_date = '2022-06-01'
end_date = '2023-12-31'

# Build the combined dataset and display the overall and per-quarter plots.
process_and_plot_by_quarters(pv_directory, gfs_directory, start_date, end_date)


In [25]:
import os
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import numpy as np
from sklearn.ensemble import IsolationForest

def plot_combined_data(combined_df, title="Combined Power vs GHI_2", color_column='Outlier'):
    """Show a Power-vs-GHI_2 scatter plot coloured by ``color_column``.

    When ``color_column`` is ``'Outlier'`` the 0/1 labels are drawn as two
    discrete groups (0 in blue, 1 in red). A single OLS trendline is fitted
    over all points.

    Args:
        combined_df: DataFrame containing ``GHI_2``, ``Power`` and, when used,
            the colour column. The caller's frame is not modified.
        title: Figure title; it is centred horizontally above the plot.
        color_column: Column used for colouring. Any value other than
            ``'Outlier'`` is passed to plotly unchanged, with no colour map.
    """
    plot_df = combined_df
    discrete_map = None
    if color_column == 'Outlier' and color_column in combined_df.columns:
        # Plotly express treats numeric colour columns as continuous and then
        # ignores color_discrete_map, so the 0/1 labels are turned into
        # strings on a copy and the map is keyed by those strings.
        # NOTE: labels are expected to be the integers 0/1; other values fall
        # back to plotly's default colours.
        plot_df = combined_df.copy()
        plot_df[color_column] = plot_df[color_column].astype(str)
        discrete_map = {'0': 'blue', '1': 'red'}

    fig = px.scatter(
        plot_df,
        x='GHI_2',
        y='Power',
        color=color_column,
        color_discrete_map=discrete_map,
        labels={"GHI_2": "GHI_2", "Power": "Power"},
        title=title,
        opacity=0.6,
        trendline="ols",
        # Discrete colours would otherwise produce one trendline per group;
        # "overall" keeps a single fit across the whole dataset.
        trendline_scope="overall",
    )
    # The selector limits the size change to traces whose mode is 'markers'.
    fig.update_traces(marker=dict(size=5), selector=dict(mode='markers'))
    fig.update_layout(
        title_x=0.5,
        title_font_size=20,
        xaxis=dict(title="GHI_2", showgrid=True, zeroline=False),
        yaxis=dict(title="Power", showgrid=True, zeroline=False)
    )
    fig.show()

def plot_by_quarters(df):
    """Draw Power against GHI_2 in a 2x2 subplot grid, one panel per quarter.

    Args:
        df: DataFrame with ``GHI_2``, ``Power`` and an integer ``Quarter``
            column (1-4). A quarter without rows produces an empty panel.
    """
    panel_names = ("Q1", "Q2", "Q3", "Q4")
    fig = make_subplots(
        rows=2,
        cols=2,
        subplot_titles=panel_names,
        x_title="GHI_2",
        y_title="Power",
    )

    marker_style = dict(size=5, opacity=0.6)
    for position, name in enumerate(panel_names):
        quarter = position + 1
        # Panels are filled row by row: positions 0-1 on top, 2-3 below.
        grid_row, grid_col = divmod(position, 2)
        rows_in_quarter = df.loc[df['Quarter'] == quarter]
        fig.add_trace(
            go.Scatter(
                x=rows_in_quarter['GHI_2'],
                y=rows_in_quarter['Power'],
                mode='markers',
                marker=marker_style,
                name=name,
            ),
            row=grid_row + 1,
            col=grid_col + 1,
        )

    layout_options = dict(
        height=800,
        width=800,
        title_text="Quarterly Scatter Plots of Power vs GHI_2",
        showlegend=False,
    )
    fig.update_layout(**layout_options)
    fig.show()

def _load_aligned_day(pv_file_path, gfs_file_path, window_start, window_end):
    """Load one day's PV and GFS CSV files and pair their rows by timestamp.

    Args:
        pv_file_path: CSV with a ``Date`` timestamp column and a ``Power`` column.
        gfs_file_path: CSV with a ``data_time`` timestamp column and a
            ``GHI_2`` column.
        window_start: ``datetime.time``; rows earlier in the day are dropped.
        window_end: ``datetime.time``; rows later in the day are dropped.

    Returns:
        DataFrame with columns ``Power``, ``GHI_2``, ``Date`` holding only the
        timestamps that occur in both files inside the (inclusive) window.
    """
    pv_df = pd.read_csv(pv_file_path)
    gfs_df = pd.read_csv(gfs_file_path)

    pv_df["Date"] = pd.to_datetime(pv_df["Date"])
    gfs_df["data_time"] = pd.to_datetime(gfs_df["data_time"])

    pv_clock = pv_df["Date"].dt.time
    gfs_clock = gfs_df["data_time"].dt.time
    pv_day = pv_df.loc[
        (pv_clock >= window_start) & (pv_clock <= window_end),
        ["Date", "Power"],
    ]
    gfs_day = gfs_df.loc[
        (gfs_clock >= window_start) & (gfs_clock <= window_end),
        ["data_time", "GHI_2"],
    ].rename(columns={"data_time": "Date"})

    # Join on the timestamp instead of truncating both frames to a common
    # length: positional pairing silently shifts every later sample when one
    # file is missing a row.
    # NOTE(review): assumes both files use the same clock/timezone for their
    # timestamps - confirm against the data.
    paired = pd.merge(pv_day, gfs_day, on="Date", how="inner")
    return paired[["Power", "GHI_2", "Date"]]


def process_and_plot_by_quarters(pv_dir, gfs_dir, start_date, end_date, output_csv_path,
                                 contamination=0.1):
    """Combine PV and GFS data, flag outliers with Isolation Forest, plot and export.

    For every calendar day from ``start_date`` to ``end_date`` the function
    looks for ``PV_<YYYY-MM-DD>.csv`` in ``pv_dir`` and
    ``lat31.25_lon121.5_<YYYYMMDD>_gfs_15min.csv`` in ``gfs_dir``. Days where
    either file is missing are skipped. Rows between 05:00 and 18:00
    (inclusive) are kept and paired by timestamp, and rows containing NaN are
    dropped. An Isolation Forest fitted on (GHI_2, Power) labels each row as
    inlier (0) or outlier (1). The labelled data is plotted, the inliers are
    plotted per quarter, and ``Date``, ``Power`` and the label are written to
    ``output_csv_path``.

    Args:
        pv_dir: Directory holding the daily PV CSV files.
        gfs_dir: Directory holding the daily GFS CSV files.
        start_date: First day to process (anything ``pd.date_range`` accepts).
        end_date: Last day to process.
        output_csv_path: Destination CSV with columns ``Date``, ``Power``, ``label``.
        contamination: Value forwarded to ``IsolationForest(contamination=...)``.
            Defaults to the previously hard-coded 0.1.

    Raises:
        ValueError: If no rows remain after loading and NaN removal, so there
            is nothing to fit the outlier model on.
    """
    # Loop-invariant window bounds, computed once instead of four times per day.
    window_start = pd.to_datetime("05:00").time()
    window_end = pd.to_datetime("18:00").time()

    daily_frames = []
    for date in pd.date_range(start_date, end_date):
        pv_file_path = os.path.join(pv_dir, f"PV_{date.strftime('%Y-%m-%d')}.csv")
        gfs_file_path = os.path.join(
            gfs_dir, f"lat31.25_lon121.5_{date.strftime('%Y%m%d')}_gfs_15min.csv"
        )
        if not (os.path.exists(pv_file_path) and os.path.exists(gfs_file_path)):
            continue

        day_df = _load_aligned_day(pv_file_path, gfs_file_path, window_start, window_end)
        if not day_df.empty:
            daily_frames.append(day_df)

    if daily_frames:
        combined_df = pd.concat(daily_frames, ignore_index=True).dropna()
    else:
        combined_df = pd.DataFrame(columns=["Power", "GHI_2", "Date"])

    if combined_df.empty:
        # Fitting on zero samples raises inside scikit-learn with a message
        # that does not mention the real cause, so fail early and clearly.
        raise ValueError(
            f"No overlapping PV/GFS samples found between {start_date} and {end_date}; "
            "check the input directories and file names."
        )

    combined_df['Date'] = pd.to_datetime(combined_df['Date'])
    combined_df['Quarter'] = combined_df['Date'].dt.quarter

    # Use Isolation Forest to identify outliers in the (GHI_2, Power) plane.
    X = combined_df[['GHI_2', 'Power']]
    isolation_forest = IsolationForest(
        n_estimators=200,
        max_samples='auto',
        contamination=contamination,
        max_features=1.0,
        random_state=42,
    )
    # fit_predict returns 1 for inliers and -1 for outliers; remap so that
    # 1 marks an outlier and 0 an inlier.
    combined_df['Outlier'] = isolation_forest.fit_predict(X)
    combined_df['Outlier'] = combined_df['Outlier'].map({1: 0, -1: 1})

    plot_combined_data(combined_df)

    cleaned_df = combined_df[combined_df['Outlier'] == 0]

    plot_by_quarters(cleaned_df)

    # Export every row (inliers and outliers) with the flag named 'label'.
    export_df = combined_df.rename(columns={'Outlier': 'label'})
    export_df[['Date', 'Power', 'label']].to_csv(output_csv_path, index=False)

# Driver for this cell: local input folders, the date span to analyse and the
# CSV that receives the outlier labels.
# pv_directory is searched for PV_<YYYY-MM-DD>.csv files and gfs_directory for
# lat31.25_lon121.5_<YYYYMMDD>_gfs_15min.csv files.
pv_directory = '/Users/maxz/Desktop/EQUOTA/光伏/processed/PV/全口径光伏'
gfs_directory = '/Users/maxz/Desktop/EQUOTA/XUHUI'
# Both dates are passed to pd.date_range, which includes both end points.
start_date = '2022-06-01'
end_date = '2023-12-31'
# Output file written with the columns Date, Power and label.
output_csv_path = '/Users/maxz/Desktop/EQUOTA/combined_outliers_power.csv'

# Build the dataset, flag outliers, show the plots and write the CSV.
process_and_plot_by_quarters(pv_directory, gfs_directory, start_date, end_date, output_csv_path)


**关于异常点处理：我尝试了多种不同的方法，这是目前运行下来最符合预期趋势的一种；主要瑕疵是右上角的部分可能被多切掉了一些。**