In [1]:
import pandas as pd
from scipy.stats import spearmanr

# --- Configuration ---
# Significance level (alpha) that the p-value is compared against.
ALPHA = 0.05

# Columns entering the correlation test.
GDP_COL = 'Average_GDP_Last_10_Years_Billion_USD'
FREQ_COL = 'Frequency'

# 1. Load the final data file
df = pd.read_csv('final_country_data.csv')

# 2. Select the two variables for correlation.
# Rows missing either value are dropped first: spearmanr's default
# nan_policy='propagate' would otherwise return NaN for both rho and the
# p-value, and the decision below would then report "fail to reject"
# even though no test was actually carried out.
df_gdp = df[[GDP_COL, FREQ_COL]].dropna()
gdp = df_gdp[GDP_COL]
frequency = df_gdp[FREQ_COL]

# 3. Perform Spearman's Rank Correlation Test
rho, p_value = spearmanr(gdp, frequency)

# --- Output and Rejection Decision ---
print("--- Spearman's Rank Correlation Test Results ---")
print(f"Sample Size (n): {len(df_gdp)}")
print(f"Correlation Coefficient (Rho): {rho:.3f}")
# Scientific notation: a fixed 5-decimal format shows very small p-values
# as 0.00000, which reads as "exactly zero".
print(f"P-value: {p_value:.3e}")
print(f"Significance Level (Alpha): {ALPHA}")
print("-" * 50)

# 4. Check for rejection of the Null Hypothesis (H0: no monotonic association)
if p_value < ALPHA:
    rejection_statement = "P-value is less than Alpha."
    rejection_decision = "Decision: REJECT the Null Hypothesis (H0)."
    conclusion = "Conclusion: There IS a statistically significant relationship between GDP and news Frequency."
else:
    # This branch also covers p == ALPHA, so the wording says so.
    rejection_statement = "P-value is greater than or equal to Alpha."
    rejection_decision = "Decision: FAIL TO REJECT the Null Hypothesis (H0)."
    conclusion = "Conclusion: We do not have sufficient evidence to claim a relationship between GDP and news Frequency."

print(rejection_statement)
print(rejection_decision)
print(conclusion)

--- Spearman's Rank Correlation Test Results ---
Correlation Coefficient (Rho): 0.657
P-value: 0.00000
Significance Level (Alpha): 0.05
--------------------------------------------------
P-value is less than Alpha.
Decision: REJECT the Null Hypothesis (H0).
Conclusion: There IS a statistically significant relationship between GDP and news Frequency.


In [9]:
import pandas as pd
from scipy.stats import ttest_ind

# Significance level for the decision at the end of this cell. Reuse the
# value from the configuration cell when it has already been run; otherwise
# fall back to the conventional 0.05 so this cell also works on a fresh kernel.
try:
    ALPHA
except NameError:
    ALPHA = 0.05

GDP_COL = 'Average_GDP_Last_10_Years_Billion_USD'
FREQ_COL = 'Frequency'

# 1. Load the data
df = pd.read_csv('final_country_data.csv')

# 2. Sort the data by GDP (highest first) to prepare for the split.
# Incomplete rows are removed first: sort_values() places NaN GDP values
# last, which would put countries with unknown GDP into the "low GDP" group,
# and a NaN Frequency would make ttest_ind return NaN.
df_sorted = (
    df.dropna(subset=[GDP_COL, FREQ_COL])
      .sort_values(by=GDP_COL, ascending=False)
      .reset_index(drop=True)
)

# 3. Split the sorted data in half (median split).
# With an odd number of rows the middle row belongs to neither group.
N = len(df_sorted)
midpoint = N // 2

group_high_gdp_freq = df_sorted.head(midpoint)[FREQ_COL]
group_low_gdp_freq = df_sorted.tail(midpoint)[FREQ_COL]

# 4. Perform Welch's t-test (difference of means, unequal variances)
t_statistic, p_value = ttest_ind(group_high_gdp_freq,
                                 group_low_gdp_freq,
                                 equal_var=False)

# 5. --- CALCULATE DEGREES OF FREEDOM (DF) ---
n1 = len(group_high_gdp_freq)
n2 = len(group_low_gdp_freq)

# Sample variances (s^2), using the unbiased estimator (ddof=1)
s1_sq = group_high_gdp_freq.var(ddof=1)
s2_sq = group_low_gdp_freq.var(ddof=1)

# Welch-Satterthwaite equation:
#   df = (s1^2/n1 + s2^2/n2)^2
#        / [ (s1^2/n1)^2 / (n1 - 1) + (s2^2/n2)^2 / (n2 - 1) ]
numerator = (s1_sq / n1 + s2_sq / n2)**2
denominator = (s1_sq / n1)**2 / (n1 - 1) + (s2_sq / n2)**2 / (n2 - 1)

df_welch = numerator / denominator
# --------------------------------------------------------

# 6. Output the results
print("\n" + "=" * 50)
print("  RESULTS OF WELCH'S T-TEST (HIGH GDP vs LOW GDP)  ")
print("=" * 50)
print(f"Group Sizes: High GDP (n={n1}), Low GDP (n={n2})")
print(f"Mean Frequency - High GDP: {group_high_gdp_freq.mean():.2f}")
print(f"Mean Frequency - Low GDP: {group_low_gdp_freq.mean():.2f}")
print("-" * 50)
print(f"T-Statistic: {t_statistic:.3f}")
# Scientific notation: a fixed 5-decimal format shows very small p-values
# as 0.00000, which reads as "exactly zero".
print(f"P-value: {p_value:.3e}")
print(f"Degrees of Freedom (df): {df_welch:.3f}")
print("=" * 50)

# 7. Decision. H0: both groups have the same mean Frequency.
if p_value < ALPHA:
    rejection_statement = "P-value is less than Alpha."
    rejection_decision = "Decision: REJECT the Null Hypothesis (H0)."
    conclusion = "Conclusion: There IS a statistically significant difference in mean news Frequency between the High GDP and Low GDP groups."
else:
    # This branch also covers p == ALPHA, so the wording says so.
    rejection_statement = "P-value is greater than or equal to Alpha."
    rejection_decision = "Decision: FAIL TO REJECT the Null Hypothesis (H0)."
    conclusion = "Conclusion: We do not have sufficient evidence to claim a difference in mean news Frequency between the High GDP and Low GDP groups."

print(rejection_statement)
print(rejection_decision)
print(conclusion)


  RESULTS OF WELCH'S T-TEST (HIGH GDP vs LOW GDP)  
Group Sizes: High GDP (n=83), Low GDP (n=83)
Mean Frequency - High GDP: 23212.00
Mean Frequency - Low GDP: 8170.24
--------------------------------------------------
T-Statistic: 5.412
P-value: 0.00000
Degrees of Freedom (df): 102.886
P-value is less than Alpha.
Decision: REJECT the Null Hypothesis (H0).
Conclusion: There IS a statistically significant relationship between GDP and news Frequency.


In [19]:
import pandas as pd
from scipy.stats import spearmanr

# Read the country-level dataset.
df = pd.read_csv('final_country_data.csv')

# Keep only the two columns under test and discard incomplete rows, so the
# row count below is the number of pairs that actually enter the test.
df_corr = df.loc[:, ['Average_Democracy_Score', 'Frequency']].dropna()
democracy, frequency = df_corr['Average_Democracy_Score'], df_corr['Frequency']

# Spearman's rank correlation between democracy score and frequency.
rho, p_value_rho = spearmanr(democracy, frequency)

# Sample size, and n - 2 degrees of freedom as used by the t-approximation
# for correlation significance.
n = df_corr.shape[0]
df_spearman = n - 2

# Assemble the report line by line and emit it in one go.
heavy_rule = "=" * 60
light_rule = "-" * 50
report_lines = [
    "[TEST 1: SPEARMAN'S RANK CORRELATION]",
    heavy_rule,
    f"Sample Size (n): {n}",
    f"Degrees of Freedom (df): {df_spearman}",
    light_rule,
    f"Correlation Coefficient (Rho): {rho:.3f}",
    f"P-value: {p_value_rho:.5f}",
    heavy_rule,
]
print("\n".join(report_lines))

[TEST 1: SPEARMAN'S RANK CORRELATION]
Sample Size (n): 166
Degrees of Freedom (df): 164
--------------------------------------------------
Correlation Coefficient (Rho): 0.257
P-value: 0.00084


In [13]:
# --- TEST 2: DIFFERENCE OF MEANS (Democracy Split vs. Frequency) ---
# Uses `df` as loaded from final_country_data.csv by the TEST 1 cell.

# Imported here so the cell does not depend on an unrelated earlier cell
# having brought ttest_ind into the kernel namespace.
from scipy.stats import ttest_ind

DEMOCRACY_COL = 'Average_Democracy_Score'
FREQ_COL = 'Frequency'

# 2a. Sort the data by Democracy Score (highest first).
# Incomplete rows are dropped first so this test runs on the same rows as
# the Spearman test; otherwise sort_values() would place NaN scores last
# (i.e. in the "low democracy" group) and a NaN Frequency would turn the
# t-test result into NaN.
df_sorted = (
    df.dropna(subset=[DEMOCRACY_COL, FREQ_COL])
      .sort_values(by=DEMOCRACY_COL, ascending=False)
      .reset_index(drop=True)
)

# 2b. Split the data in half (median split).
# With an odd number of rows the middle row belongs to neither group.
N = len(df_sorted)
midpoint = N // 2
group_high_demo_freq = df_sorted.head(midpoint)[FREQ_COL]
group_low_demo_freq = df_sorted.tail(midpoint)[FREQ_COL]

# 2c. Perform Welch's t-test (unequal variances)
t_statistic, p_value_t = ttest_ind(group_high_demo_freq,
                                    group_low_demo_freq,
                                    equal_var=False)

# 2d. Degrees of freedom via the Welch-Satterthwaite equation:
#   df = (s1^2/n1 + s2^2/n2)^2
#        / [ (s1^2/n1)^2 / (n1 - 1) + (s2^2/n2)^2 / (n2 - 1) ]
n1, n2 = len(group_high_demo_freq), len(group_low_demo_freq)
s1_sq = group_high_demo_freq.var(ddof=1)
s2_sq = group_low_demo_freq.var(ddof=1)
numerator = (s1_sq / n1 + s2_sq / n2)**2
denominator = (s1_sq / n1)**2 / (n1 - 1) + (s2_sq / n2)**2 / (n2 - 1)
df_welch = numerator / denominator

# 2e. Output the results
print("\n[TEST 2: DIFFERENCE OF MEANS (Welch's t-test)]")
print(f"Group Sizes: High Democracy (n={n1}), Low Democracy (n={n2})")
print(f"Mean Frequency - High Democracy Group: {group_high_demo_freq.mean():.2f}")
print(f"Mean Frequency - Low Democracy Group: {group_low_demo_freq.mean():.2f}")
print("-" * 50)
print(f"T-Statistic: {t_statistic:.3f}")
print(f"P-value: {p_value_t:.5f}")
print(f"Degrees of Freedom (df): {df_welch:.3f}")
print("=" * 60)


[TEST 2: DIFFERENCE OF MEANS (Welch's t-test)]
Mean Frequency - High Democracy Group: 18748.29
Mean Frequency - Low Democracy Group: 12633.95
--------------------------------------------------
T-Statistic: 2.052
P-value: 0.04189
Degrees of Freedom (df): 150.705


In [1]:
import pandas as pd
import scipy.stats as stats
import os

# Significance level for the conclusion printed at the end.
ALPHA = 0.05
# Small offset added before inverting PowerIndex so that a value of exactly
# 0.0 does not cause a division by zero.
EPSILON = 0.0001

# 1. Load Data
# __file__ is defined when this runs as a script; inside a notebook it is
# not, so fall back to the current working directory.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

file_path = os.path.join(script_dir, 'final_complete_dataset.csv')
df = pd.read_csv(file_path)

# 2. Prepare Data (Higher Score = Stronger Military)
# We invert PowerIndex because normally 0.0 is perfect strength.
df['Military_Strength'] = 1 / (df['PowerIndex'] + EPSILON)

# Drop incomplete rows: spearmanr's default nan_policy='propagate' would
# return NaN for the coefficient and p-value if any value were missing.
df_valid = df.dropna(subset=['Military_Strength', 'Frequency'])

# 3. Run Spearman Correlation Test
corr, p_value = stats.spearmanr(df_valid['Military_Strength'], df_valid['Frequency'])

print("--- Spearman Correlation Test Results ---")
print(f"Correlation Coefficient: {corr:.3f}")
print(f"P-Value: {p_value:.3e}")

if p_value < ALPHA:
    print("CONCLUSION: Statistically SIGNIFICANT relationship.")
    # The direction of the claim must follow the sign of the coefficient;
    # a significant negative correlation means the opposite.
    if corr > 0:
        print("Countries with stronger militaries are mentioned more frequently.")
    else:
        print("Countries with stronger militaries are mentioned less frequently.")
else:
    print("CONCLUSION: No significant relationship found.")

--- Spearman Correlation Test Results ---
Correlation Coefficient: 0.654
P-Value: 4.879e-19
CONCLUSION: Statistically SIGNIFICANT relationship.
Countries with stronger militaries are mentioned more frequently.


In [2]:
import pandas as pd
import scipy.stats as stats
import os

# Significance level for the conclusion printed at the end.
ALPHA = 0.05
# Small offset added before inverting PowerIndex so that a value of exactly
# 0.0 does not cause a division by zero.
EPSILON = 0.0001

# 1. Load Data
# __file__ is defined when this runs as a script; inside a notebook it is
# not, so fall back to the current working directory.
try:
    script_dir = os.path.dirname(os.path.abspath(__file__))
except NameError:
    script_dir = os.getcwd()

file_path = os.path.join(script_dir, 'final_complete_dataset.csv')
df = pd.read_csv(file_path)

# 2. Prepare Data (higher Military_Strength = lower PowerIndex)
df['Military_Strength'] = 1 / (df['PowerIndex'] + EPSILON)

# Drop incomplete rows: a NaN Frequency in either group would make
# ttest_ind return NaN, which the check below would report as
# "no significant difference".
df_valid = df.dropna(subset=['Military_Strength', 'Frequency'])

# 3. Split into Two Groups (High vs. Low)
# Rows exactly at the median go to the strong group.
median_strength = df_valid['Military_Strength'].median()
is_strong = df_valid['Military_Strength'] >= median_strength

group_strong = df_valid.loc[is_strong, 'Frequency']
group_weak = df_valid.loc[~is_strong, 'Frequency']

# 4. Run T-Test (Independent Samples)
# We use equal_var=False (Welch's t-test) because variance often differs between groups
t_stat, p_value = stats.ttest_ind(group_strong, group_weak, equal_var=False)

print("--- Two-Sample T-Test Results ---")
print(f"Median Split Point: {median_strength:.2f}")
print(f"Mean Frequency (Strong Group): {group_strong.mean():,.0f}")
print(f"Mean Frequency (Weak Group):   {group_weak.mean():,.0f}")
print(f"T-Statistic: {t_stat:.3f}")
print(f"P-Value: {p_value:.3e}")

if p_value < ALPHA:
    print("CONCLUSION: Statistically SIGNIFICANT difference.")
    # t_stat is positive when the strong group's mean exceeds the weak
    # group's mean; the wording must follow that sign.
    if t_stat > 0:
        print("The 'Strong Military' group receives significantly more coverage.")
    else:
        print("The 'Strong Military' group receives significantly less coverage.")
else:
    print("CONCLUSION: No significant difference between the groups.")

--- Two-Sample T-Test Results ---
Median Split Point: 0.72
Mean Frequency (Strong Group): 25,085
Mean Frequency (Weak Group):   9,370
T-Statistic: 5.104
P-Value: 1.729e-06
CONCLUSION: Statistically SIGNIFICANT difference.
The 'Strong Military' group receives significantly more coverage.
