In [5]:
import string
import requests
import json
import pandas as pd
from textblob import TextBlob
import nltk
from nltk.corpus import stopwords
from langdetect import detect
import mysql.connector
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /home/intn122@episourcein.episource.com/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/intn122@episourcein.episource.com/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [6]:
def clean_text(text):
    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])
    tokens = nltk.word_tokenize(text)
    tokens = [token for token in tokens if token not in stopwords.words('english')]
    return ' '.join(tokens)

def analyze_sentiment(text):
    return TextBlob(text).sentiment

def categorize_sentiment(polarity):
    if polarity > 0.1:
        return 'Positive'
    elif polarity < -0.1:
        return 'Negative'
    else:
        return 'Neutral'

def fetch_news(api_key, query, from_date, to_date):
    base_url = "https://newsapi.org/v2/everything"
    params = {
        "q": query,
        "from": from_date,
        "to": to_date,
        "sortBy": "publishedAt",
        "apiKey": api_key
    }
    response = requests.get(base_url, params=params)
    return response.json()

def save_to_json(filename, data):
    with open(filename, 'w') as file:
        json.dump(data, file)

def json_to_df(filename):
    with open(filename, 'r') as file:
        data = json.load(file)
    return pd.DataFrame(data['articles'])

def is_english(text):
    try:
        return detect(text) == 'en'
    except:
        return False

def insert_into_db(df,db_config):
    conn=mysql.connector.connect(**db_config)
    cursor=conn.cursor()
    insert_query="""
    INSERT INTO news_articles(id , tittle, content , sentiment , published_at)
    VALUES(%s , %s, %s, %s , %s)
    """
    for _, row in df.iterrows():
        values=(row['title'] , row['content'] , row['sentiment'] , row['publishedAt'])
        cursor.execute(insert_query, values)
        
    conn.commit()
    cursor.close()
    conn.close()

In [7]:
api_key = "0b2702ac89d64fdaac467ab4fc047639"  
query = "India OR Maldives"
from_date = "2024-01-01"
to_date = "2024-01-21"

news_data = fetch_news(api_key, query, from_date, to_date)
save_to_json('news_data.json', news_data)

df = json_to_df('news_data.json')
df = df[df['content'].apply(lambda x: is_english(x) if x else False)]
df['cleaned_content'] = df['content'].apply(lambda x: clean_text(x) if x else '')
df['sentiment_analysis'] = df['cleaned_content'].apply(lambda x: analyze_sentiment(x))
df['sentiment'] = df['sentiment_analysis'].apply(lambda x: categorize_sentiment(x.polarity))
df['publishedAt']=df['publishedAt'].apply(lambda x: x[:10])
df['publishedAt']=df['publishedAt'].astype('datetime64[ns]')
print(df)
db_config={
    'host': 'localhost',
    'user': 'root',
    'port':"3306",
    'database': 'news_data_db'
}
insert_into_db(df, db_config)

                                               source             author  \
0                     {'id': None, 'name': 'TheWrap'}    Stephanie Kaloi   
1   {'id': 'the-times-of-india', 'name': 'The Time...   Yogesh Kabirdoss   
2           {'id': None, 'name': 'InvestorsObserver'}  InvestorsObserver   
3                 {'id': None, 'name': 'Technews.tw'}             中央廣播電台   
4   {'id': 'the-times-of-india', 'name': 'The Time...       Sandip Dighe   
..                                                ...                ...   
92                   {'id': None, 'name': 'Livemint'}     Suneera Tandon   
93  {'id': 'the-times-of-india', 'name': 'The Time...          ET Bureau   
97  {'id': 'the-times-of-india', 'name': 'The Time...                PTI   
98                  {'id': None, 'name': 'NDTV News'}               None   
99                  {'id': None, 'name': 'Aajtak.in'}      अर्पिता आर्या   

                                                title  \
0   Donald Trump Defends Thinl

ProgrammingError: Not enough parameters for the SQL statement