In [None]:
# Simple EDA and linear regression on a small, relatively clean, two-variable dataset exploring the relationship
# between the chirp rate of crickets (chirps/sec) and the temperature.
# I have imported the .csv into a Pandas dataframe, created new headings, and converted the temperature column
# from Fahrenheit to Celsius.

# Hypothesis: The chirping frequency of crickets depends on the temperature
# Null hypothesis: There is no relationship between the temperature and the chirp frequency of crickets

# Dependent variable: Y is the chirp frequency of the striped ground cricket
# Independent variable: X is the temperature

import pandas as pd
import numpy as np
from scipy import stats
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()


# Load the cricket data set and give its columns descriptive names.
# NOTE(review): assumes crickets.csv has exactly two columns, with temperature
# (in degrees Fahrenheit) first and chirp rate second -- confirm against the file.
columnNames = ['Temp', 'Chirps']  # New column names to replace X and Y
df = pd.read_csv('crickets.csv')  # Importing csv
df.columns = columnNames  # Replacing column names

# Convert Fahrenheit to Celsius with a vectorized column expression instead of
# a per-element apply/lambda; the arithmetic (and its evaluation order) is the same.
df['Temp'] = (df['Temp'] - 32) * 5 / 9

# Convenience handles: x is the independent variable, y the dependent one.
x_vals, y_vals = df['Temp'], df['Chirps']

print(df.describe())  # print summary statistics


# Fit a least-squares line: chirps = slope * temperature + intercept.
slope, intercept, r_value, p_value, std_err = stats.linregress(x_vals, y_vals)

# Two endpoints spanning the observed temperature range are enough to draw
# the straight fitted line.
xTheor = np.array([x_vals.min(), x_vals.max()])
yTheor = slope * xTheor + intercept

plt.scatter(x_vals, y_vals)  # observed data points
plt.plot(xTheor, yTheor, color='k')  # fitted regression line
plt.title('Chirps per Second vs Temperature')
plt.xlabel('Temperature (C)')
plt.ylabel('Chirps / s')

# Save only after the regression line has been drawn so the PNG actually
# contains it (previously the file was written with the scatter points only),
# and before plt.show(), after which the figure may no longer be available.
plt.savefig('linregPlot.png')
plt.show()

# Report the coefficient of determination (square of the correlation
# coefficient) and the p-value obtained from the regression.
print(f"R-Squared = {r_value ** 2!s}")
print(f"p = {p_value!s}")
